diff --git a/.ci/aarch64_linux/aarch64_ci_build.sh b/.ci/aarch64_linux/aarch64_ci_build.sh index bf8bab6dde232..a0eb0b72df2b3 100644 --- a/.ci/aarch64_linux/aarch64_ci_build.sh +++ b/.ci/aarch64_linux/aarch64_ci_build.sh @@ -5,9 +5,9 @@ GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-} # Set CUDA architecture lists to match x86 build_cuda.sh if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then - export TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;8.0;9.0" + export TORCH_CUDA_ARCH_LIST="8.0;9.0" elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then - export TORCH_CUDA_ARCH_LIST="7.0;8.0;9.0;10.0;12.0" + export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0" elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX" fi @@ -31,8 +31,7 @@ pip install -r /pytorch/requirements.txt pip install auditwheel==6.2.0 wheel if [ "$DESIRED_CUDA" = "cpu" ]; then echo "BASE_CUDA_VERSION is not set. Building cpu wheel." - #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files - USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn + python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn else echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" export USE_SYSTEM_NCCL=1 @@ -42,13 +41,9 @@ else echo "Bundling CUDA libraries with wheel for aarch64." else echo "Using nvidia libs from pypi for aarch64." - # Fix platform constraints in PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64 - # Replace 'platform_machine == "x86_64"' with 'platform_machine == "aarch64"' - export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS//platform_machine == \'x86_64\'/platform_machine == \'aarch64\'}" echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS" export USE_NVIDIA_PYPI_LIBS=1 fi - #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files - USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda + python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda fi diff --git a/.ci/aarch64_linux/aarch64_wheel_ci_build.py b/.ci/aarch64_linux/aarch64_wheel_ci_build.py index 4bb9c64ea7772..d4afea81ac0b4 100755 --- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py +++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py @@ -138,6 +138,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: folder = os.path.dirname(wheel_path) os.mkdir(f"{folder}/tmp") os.system(f"unzip {wheel_path} -d {folder}/tmp") + # Delete original wheel since it will be repackaged + os.system(f"rm {wheel_path}") # Check if we should use PyPI NVIDIA libraries or bundle system libraries use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1" @@ -211,7 +213,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: ] # CUDA version-specific libraries - if "130" in desired_cuda: + if "13" in desired_cuda: + minor_version = desired_cuda[-1] version_specific_libs = [ "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13", "/usr/local/cuda/lib64/libcublas.so.13", @@ -221,7 +224,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: "/usr/local/cuda/lib64/libcusolver.so.12", "/usr/local/cuda/lib64/libnvJitLink.so.13", "/usr/local/cuda/lib64/libnvrtc.so.13", - "/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0", + f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}", ] elif "12" in 
desired_cuda: # Get the last character for libnvrtc-builtins version (e.g., "129" -> "9") @@ -237,6 +240,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: "/usr/local/cuda/lib64/libnvrtc.so.12", f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}", ] + else: + raise ValueError(f"Unsupported CUDA version: {desired_cuda}.") # Combine all libraries libs_to_copy = common_libs + version_specific_libs @@ -275,14 +280,7 @@ def complete_wheel(folder: str) -> str: f"/{folder}/dist/{repaired_wheel_name}", ) else: - repaired_wheel_name = wheel_name.replace( - "linux_aarch64", "manylinux_2_28_aarch64" - ) - print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}") - os.rename( - f"/{folder}/dist/{wheel_name}", - f"/{folder}/dist/{repaired_wheel_name}", - ) + repaired_wheel_name = list_dir(f"/{folder}/dist")[0] print(f"Copying {repaired_wheel_name} to artifacts") shutil.copy2( @@ -319,7 +317,7 @@ def parse_arguments(): ).decode() print("Building PyTorch wheel") - build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + build_vars = "" # MAX_JOB=5 is not required for CPU backend (see commit 465d98b) if enable_cuda: build_vars += "MAX_JOBS=5 " diff --git a/.ci/aarch64_linux/build_aarch64_wheel.py b/.ci/aarch64_linux/build_aarch64_wheel.py index 7a4715d330060..52525f14460da 100755 --- a/.ci/aarch64_linux/build_aarch64_wheel.py +++ b/.ci/aarch64_linux/build_aarch64_wheel.py @@ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5): try: with socket.create_connection((addr, port), timeout=timeout): return - except (ConnectionRefusedError, socket.timeout): # noqa: PERF203 + except (ConnectionRefusedError, TimeoutError): # noqa: PERF203 if i == attempt_cnt - 1: raise time.sleep(timeout) @@ -1004,7 +1004,7 @@ def parse_arguments(): install_condaforge_python(host, args.python_version) sys.exit(0) - python_version = args.python_version if args.python_version is not None else "3.9" + python_version = args.python_version if args.python_version is not None else "3.10" if args.use_torch_from_pypi: configure_system(host, compiler=args.compiler, python_version=python_version) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 48be0cf538054..6ebff8d531e9f 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -214,8 +214,7 @@ case "$tag" in TRITON=yes ;; pytorch-linux-jammy-py3-gcc11-inductor-benchmarks) - # TODO (huydhn): Upgrade this to Python >= 3.10 - ANACONDA_PYTHON_VERSION=3.9 + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes KATEX=yes @@ -263,13 +262,10 @@ case "$tag" in TRITON_CPU=yes ;; pytorch-linux-jammy-linter) - # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627. - # We will need to update mypy version eventually, but that's for another day. 
The task - # would be to upgrade mypy to 1.0.0 with Python 3.11 - PYTHON_VERSION=3.9 + PYTHON_VERSION=3.10 ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter) - PYTHON_VERSION=3.9 + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter) + PYTHON_VERSION=3.10 CUDA_VERSION=12.8.1 ;; pytorch-linux-jammy-aarch64-py3.10-gcc11) diff --git a/.ci/docker/centos-rocm/Dockerfile b/.ci/docker/centos-rocm/Dockerfile index 07788af580e3a..4fa4ca29886e6 100644 --- a/.ci/docker/centos-rocm/Dockerfile +++ b/.ci/docker/centos-rocm/Dockerfile @@ -59,9 +59,13 @@ ENV INSTALLED_VISION ${VISION} # Install rocm ARG ROCM_VERSION +RUN mkdir ci_commit_pins +COPY ./common/common_utils.sh common_utils.sh +COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt COPY ./common/install_rocm.sh install_rocm.sh RUN bash ./install_rocm.sh -RUN rm install_rocm.sh +RUN rm install_rocm.sh common_utils.sh +RUN rm -r ci_commit_pins COPY ./common/install_rocm_magma.sh install_rocm_magma.sh RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} RUN rm install_rocm_magma.sh diff --git a/.ci/docker/ci_commit_pins/executorch.txt b/.ci/docker/ci_commit_pins/executorch.txt index 0e527f4682297..0a30a6037a05c 100644 --- a/.ci/docker/ci_commit_pins/executorch.txt +++ b/.ci/docker/ci_commit_pins/executorch.txt @@ -1 +1 @@ -56392aa978594cc155fa8af48cd949f5b5f1823a +e0dda9059d082537cee36be6c5e4fe3b18c880c0 diff --git a/.ci/docker/ci_commit_pins/huggingface-requirements.txt b/.ci/docker/ci_commit_pins/huggingface-requirements.txt index 66e5dbdfb1bb1..f4f3830136eb6 100644 --- a/.ci/docker/ci_commit_pins/huggingface-requirements.txt +++ b/.ci/docker/ci_commit_pins/huggingface-requirements.txt @@ -1,2 +1,2 @@ -transformers==4.54.0 +transformers==4.56.0 soxr==0.5.0 diff --git a/.ci/docker/ci_commit_pins/rocm-composable-kernel.txt b/.ci/docker/ci_commit_pins/rocm-composable-kernel.txt new file mode 100644 index 0000000000000..c45f46af95d03 --- /dev/null +++ b/.ci/docker/ci_commit_pins/rocm-composable-kernel.txt @@ -0,0 +1 @@ +7fe50dc3da2069d6645d9deb8c017a876472a977 diff --git a/.ci/docker/common/install_executorch.sh b/.ci/docker/common/install_executorch.sh index becd2264e3958..fb168acd4febe 100755 --- a/.ci/docker/common/install_executorch.sh +++ b/.ci/docker/common/install_executorch.sh @@ -42,22 +42,27 @@ install_pip_dependencies() { # A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current # numba and scipy version used in PyTorch CI conda_run pip uninstall -y numba scipy + # Yaspin is needed for running CI test (get_benchmark_analysis_data.py) + pip_install yaspin==3.1.0 popd } setup_executorch() { - pushd executorch - export PYTHON_EXECUTABLE=python - export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" + export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON" as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true - popd } -clone_executorch -install_buck2 -install_conda_dependencies -install_pip_dependencies -setup_executorch +if [ $# -eq 0 ]; then + clone_executorch + install_buck2 + install_conda_dependencies + install_pip_dependencies + pushd executorch + setup_executorch + popd +else + "$@" +fi diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index 6d8918f79a0af..675a7a3437274 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -2,6 +2,11 @@ 
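# Editor's aside on the install_executorch.sh hunk above (hedged sketch, not part of the
# patch): the script now doubles as a tiny dispatcher. Called with no arguments it runs
# the full clone/install sequence as before; called with arguments it executes them as a
# command, which appears to be how .ci/pytorch/test.sh now runs a single step via
# "${INSTALL_SCRIPT}" setup_executorch without re-cloning ExecuTorch. The pattern in
# isolation, with made-up step names:
#
#   #!/bin/bash
#   step_one() { echo "step one"; }
#   step_two() { echo "step two"; }
#   run_all()  { step_one; step_two; }
#
#   if [ $# -eq 0 ]; then
#     run_all          # ./script.sh          -> run everything
#   else
#     "$@"             # ./script.sh step_two -> run just that function
#   fi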
set -ex +# for pip_install function +source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" + +ROCM_COMPOSABLE_KERNEL_VERSION="$(cat $(dirname $0)/../ci_commit_pins/rocm-composable-kernel.txt)" + ver() { printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' '); } @@ -109,8 +114,7 @@ EOF rm -rf HIP clr fi - # temporary hipblasLT dependency install - apt install libmsgpackc2 + pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION" # Cleanup apt-get autoclean && apt-get clean @@ -122,8 +126,8 @@ install_centos() { yum update -y yum install -y kmod yum install -y wget - - if [[ $OS_VERSION == 9 ]]; then + + if [[ $OS_VERSION == 9 ]]; then dnf install -y openblas-serial dnf install -y dkms kernel-headers kernel-devel else @@ -195,6 +199,8 @@ install_centos() { sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;" done + pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION" + # Cleanup yum clean all rm -rf /var/cache/yum diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 45fef66fd567f..08687a02530e9 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -93,8 +93,9 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x" #Pinned versions: #test that import: -mypy==1.16.0 +mypy==1.16.0 ; platform_system != "Windows" # Pin MyPy version because new errors are likely to appear with each release +# Skip on Windows as lots of type annotations are POSIX specific #Description: linter #Pinned versions: 1.16.0 #test that import: test_typing.py, test_type_hints.py @@ -322,8 +323,6 @@ lxml==5.3.0 ; python_version <= "3.12" lxml==6.0.0 ; python_version == "3.13" #Description: This is a requirement of unittest-xml-reporting -# Python-3.9 binaries - PyGithub==2.3.0 sympy==1.13.3 diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index efe6fb4c949b0..c30ab3e993e94 100644 --- a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -1,7 +1,7 @@ sphinx==5.3.0 #Description: This is used to generate PyTorch docs #Pinned versions: 5.3.0 --e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2 +-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering # but it doesn't seem to work and hangs around idly. 
The initial thought that it is probably diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile index 681f6fe750510..b517a990a057b 100644 --- a/.ci/docker/ubuntu-rocm/Dockerfile +++ b/.ci/docker/ubuntu-rocm/Dockerfile @@ -52,9 +52,13 @@ ENV INSTALLED_VISION ${VISION} # Install rocm ARG ROCM_VERSION +RUN mkdir ci_commit_pins +COPY ./common/common_utils.sh common_utils.sh +COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt COPY ./common/install_rocm.sh install_rocm.sh RUN bash ./install_rocm.sh -RUN rm install_rocm.sh +RUN rm install_rocm.sh common_utils.sh +RUN rm -r ci_commit_pins COPY ./common/install_rocm_magma.sh install_rocm_magma.sh RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} RUN rm install_rocm_magma.sh diff --git a/.ci/libtorch/build.sh b/.ci/libtorch/build.sh index 54ddd905aad05..c2d67f8b1bb29 100644 --- a/.ci/libtorch/build.sh +++ b/.ci/libtorch/build.sh @@ -7,4 +7,4 @@ set -ex SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh +USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh diff --git a/.ci/lumen_cli/cli/lib/core/vllm/lib.py b/.ci/lumen_cli/cli/lib/core/vllm/lib.py index 98cfc807e284a..0e2132839adbb 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/lib.py +++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py @@ -41,7 +41,6 @@ def sample_vllm_test_library(): "pytest -v -s basic_correctness/test_cumem.py", "pytest -v -s basic_correctness/test_basic_correctness.py", "pytest -v -s basic_correctness/test_cpu_offload.py", - "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py", ], }, "vllm_basic_models_test": { @@ -68,15 +67,12 @@ def sample_vllm_test_library(): "-v", "-s", "entrypoints/llm", - "--ignore=entrypoints/llm/test_lazy_outlines.py", "--ignore=entrypoints/llm/test_generate.py", - "--ignore=entrypoints/llm/test_generate_multiple_loras.py", "--ignore=entrypoints/llm/test_collective_rpc.py", ] ), - "pytest -v -s entrypoints/llm/test_lazy_outlines.py", - "pytest -v -s entrypoints/llm/test_generate.py ", - "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode", + "pytest -v -s entrypoints/llm/test_generate.py", + "pytest -v -s entrypoints/offline_mode", ], }, "vllm_regression_test": { diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py index 8db48065cb052..415e05d07551b 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py @@ -66,6 +66,11 @@ class VllmBuildParameters: "DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm" ) + # the cleaning script to remove torch dependencies from pip + cleaning_script: Path = env_path_field( + "cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py" + ) + # OUTPUT_DIR: where docker buildx (local exporter) will write artifacts output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm") @@ -160,6 +165,7 @@ def run(self): logger.info("Running vllm build with inputs: %s", inputs) vllm_commit = clone_vllm() + self.cp_torch_cleaning_script(inputs) self.cp_dockerfile_if_exist(inputs) # cp torch wheels from root direct to vllm workspace if exist self.cp_torch_whls_if_exist(inputs) @@ -205,6 +211,11 @@ def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str: copy(inputs.torch_whls_path, tmp_dir) return tmp_dir + def 
cp_torch_cleaning_script(self, inputs: VllmBuildParameters): + script = get_path(inputs.cleaning_script, resolve=True) + vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py") + copy(script, vllm_script) + def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters): if not inputs.use_local_dockerfile: logger.info("using vllm default dockerfile.torch_nightly for build") diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py index 76401e33f29fd..224f078788702 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py @@ -11,7 +11,7 @@ from cli.lib.common.cli_helper import BaseRunner from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env -from cli.lib.common.path_helper import copy, remove_dir +from cli.lib.common.path_helper import copy, get_path, remove_dir from cli.lib.common.pip_helper import ( pip_install_first_match, pip_install_packages, @@ -43,6 +43,10 @@ class VllmTestParameters: torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9") + cleaning_script: Path = env_path_field( + "cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py" + ) + def __post_init__(self): if not self.torch_whls_path.exists(): raise ValueError("missing torch_whls_path") @@ -92,11 +96,13 @@ def prepare(self): self._set_envs(params) clone_vllm(dst=self.work_directory) + self.cp_torch_cleaning_script(params) with working_directory(self.work_directory): remove_dir(Path("vllm")) self._install_wheels(params) self._install_dependencies() # verify the torches are not overridden by test dependencies + check_versions() def run(self): @@ -125,6 +131,11 @@ def run(self): # double check the torches are not overridden by other packages check_versions() + def cp_torch_cleaning_script(self, params: VllmTestParameters): + script = get_path(params.cleaning_script, resolve=True) + vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py") + copy(script, vllm_script) + def _install_wheels(self, params: VllmTestParameters): logger.info("Running vllm test with inputs: %s", params) if not pkg_exists("torch"): diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index edfff60744919..9c9d223777466 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -258,11 +258,19 @@ function install_torchrec_and_fbgemm() { git clone --recursive https://github.com/pytorch/fbgemm pushd fbgemm/fbgemm_gpu git checkout "${fbgemm_commit}" --recurse-submodules - python setup.py bdist_wheel \ - --build-variant=rocm \ - -DHIP_ROOT_DIR="${ROCM_PATH}" \ - -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \ - -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA" + # until the fbgemm_commit includes the tbb patch + patch <<'EOF' +--- a/FbgemmGpu.cmake ++++ b/FbgemmGpu.cmake +@@ -184,5 +184,6 @@ gpu_cpp_library( + fbgemm_gpu_tbe_cache + fbgemm_gpu_tbe_optimizers + fbgemm_gpu_tbe_utils ++ tbb + DESTINATION + fbgemm_gpu) +EOF + python setup.py bdist_wheel --build-variant=rocm popd # Save the wheel before cleaning up diff --git a/.ci/pytorch/functorch_doc_push_script.sh b/.ci/pytorch/functorch_doc_push_script.sh deleted file mode 100755 index 85c70dffa3966..0000000000000 --- a/.ci/pytorch/functorch_doc_push_script.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -# This is where the local pytorch install in the docker image is located -pt_checkout="/var/lib/jenkins/workspace" -source "$pt_checkout/.ci/pytorch/common_utils.sh" -echo "functorch_doc_push_script.sh: Invoked with 
$*" - -set -ex -o pipefail - -version=${DOCS_VERSION:-nightly} -echo "version: $version" - -# Build functorch docs -pushd $pt_checkout/functorch/docs -make html -popd - -git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages -pushd functorch_ghpages - -if [ "$version" == "main" ]; then - version=nightly -fi - -git rm -rf "$version" || true -mv "$pt_checkout/functorch/docs/build/html" "$version" - -git add "$version" || true -git status -git config user.email "soumith+bot@pytorch.org" -git config user.name "pytorchbot" -# If there aren't changes, don't make a commit; push is no-op -git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true -git status - -if [[ "${WITH_PUSH:-}" == true ]]; then - git push -u origin gh-pages -fi - -popd diff --git a/.ci/pytorch/macos-test.sh b/.ci/pytorch/macos-test.sh index 79d47da431712..c1505bd58cdde 100755 --- a/.ci/pytorch/macos-test.sh +++ b/.ci/pytorch/macos-test.sh @@ -59,7 +59,7 @@ test_python_shard() { setup_test_python - time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS" + time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "$1" "$NUM_TEST_SHARDS" assert_git_not_dirty } diff --git a/.ci/pytorch/numba-cuda-13.patch b/.ci/pytorch/numba-cuda-13.patch new file mode 100644 index 0000000000000..f96ff287ed396 --- /dev/null +++ b/.ci/pytorch/numba-cuda-13.patch @@ -0,0 +1,25 @@ +From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001 +From: Michael Wang <13521008+isVoid@users.noreply.github.com> +Date: Tue, 1 Apr 2025 17:28:05 -0700 +Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage + (#185) + +Co-authored-by: isVoid +--- + numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py +index 1641bf77..233e9ed7 100644 +--- a/numba_cuda/numba/cuda/cudadrv/driver.py ++++ b/numba_cuda/numba/cuda/cudadrv/driver.py +@@ -365,6 +365,9 @@ def _find_api(self, fname): + else: + variants = ('_v2', '') + ++ if fname in ("cuCtxGetDevice", "cuCtxSynchronize"): ++ return getattr(self.lib, fname) ++ + for variant in variants: + try: + return getattr(self.lib, f'{fname}{variant}') diff --git a/.ci/pytorch/smoke_test/smoke_test.py b/.ci/pytorch/smoke_test/smoke_test.py index 305ad15d98e7e..675d58a3e283d 100644 --- a/.ci/pytorch/smoke_test/smoke_test.py +++ b/.ci/pytorch/smoke_test/smoke_test.py @@ -386,8 +386,8 @@ def foo(x: torch.Tensor) -> torch.Tensor: def smoke_test_nvshmem() -> None: - if not torch.cuda.is_available(): - print("CUDA is not available, skipping NVSHMEM test") + if not torch.cuda.is_available() or target_os == "windows": + print("Windows platform or CUDA is not available, skipping NVSHMEM test") return # Check if NVSHMEM is compiled in current build @@ -396,7 +396,9 @@ def smoke_test_nvshmem() -> None: except ImportError: # Not built with NVSHMEM support. 
# torch is not compiled with NVSHMEM prior to 2.9 - if torch.__version__ < "2.9": + from torch.torch_version import TorchVersion + + if TorchVersion(torch.__version__) < (2, 9): return else: # After 2.9: NVSHMEM is expected to be compiled in current build diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index e0d47259676b7..7267541483438 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -32,6 +32,16 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v git config --global --add safe.directory /var/lib/jenkins/workspace fi + +# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878 +NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true) +if [ -n "$NUMBA_CUDA_DIR" ]; then + NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch" + pushd "$NUMBA_CUDA_DIR" + patch -p4 <"$NUMBA_PATCH" + popd +fi + echo "Environment variables:" env @@ -312,23 +322,29 @@ test_python_shard() { # modify LD_LIBRARY_PATH to ensure it has the conda env. # This set of tests has been shown to be buggy without it for the split-build - time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running assert_git_not_dirty } test_python() { # shellcheck disable=SC2086 - time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION assert_git_not_dirty } test_python_smoke() { - # Smoke tests for H100 + # Smoke tests for H100/B200 time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running assert_git_not_dirty } +test_python_smoke_b200() { + # Targeted smoke tests for B200 - staged approach to avoid too many failures + time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + assert_git_not_dirty +} + test_h100_distributed() { # Distributed tests at H100 time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running @@ -374,6 +390,7 @@ test_dynamo_wrapped_shard() { --exclude-distributed-tests \ --exclude-torch-export-tests \ --exclude-aot-dispatch-tests \ + --exclude-quantization-tests \ --shard "$1" "$NUM_TEST_SHARDS" \ --verbose \ --upload-artifacts-while-running @@ -1146,6 +1163,12 @@ test_distributed() { fi } +test_quantization() { + echo "Testing quantization" + + python test/test_quantization.py +} + test_rpc() { echo "Testing RPC C++ tests" # NB: the ending test_rpc must match the current function name for the current @@ -1540,14 +1563,10 @@ test_executorch() { install_torchvision install_torchaudio - pushd /executorch - - export PYTHON_EXECUTABLE=python - export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" + 
INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh" - # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch - # from the PR - bash .ci/scripts/setup-linux.sh --build-tool cmake + pushd /executorch + "${INSTALL_SCRIPT}" setup_executorch echo "Run ExecuTorch unit tests" pytest -v -n auto @@ -1561,17 +1580,14 @@ test_executorch() { popd - # Test torchgen generated code for Executorch. - echo "Testing ExecuTorch op registration" - "$BUILD_BIN_DIR"/test_edge_op_registration - assert_git_not_dirty } test_linux_aarch64() { python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \ test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \ - test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \ + test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \ + distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \ --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose # Dynamo tests @@ -1646,6 +1662,8 @@ elif [[ "${TEST_CONFIG}" == *executorch* ]]; then test_executorch elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then test_python_legacy_jit +elif [[ "$TEST_CONFIG" == 'quantization' ]]; then + test_quantization elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests echo "no-op at the moment" @@ -1721,11 +1739,6 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then elif [[ "${TEST_CONFIG}" == *inductor* ]]; then install_torchvision test_inductor_shard "${SHARD_NUMBER}" - if [[ "${SHARD_NUMBER}" == 1 ]]; then - if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.9-gcc11-build ]]; then - test_inductor_distributed - fi - fi elif [[ "${TEST_CONFIG}" == *einops* ]]; then test_einops elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then @@ -1775,6 +1788,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then test_xpu_bin elif [[ "${TEST_CONFIG}" == smoke ]]; then test_python_smoke +elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then + test_python_smoke_b200 elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then test_h100_distributed elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then diff --git a/.ci/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat index 19d715b9d0b6d..67d1569221924 100644 --- a/.ci/pytorch/win-test-helpers/build_pytorch.bat +++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat @@ -137,7 +137,7 @@ sccache --show-stats python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])" ( if "%BUILD_ENVIRONMENT%"=="" ( - echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash. + echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash. 
) else ( copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%" diff --git a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat index 01e08c8bb4e5c..abd2c8722b11d 100644 --- a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat +++ b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat @@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" ( ) else ( set CONDA_PARENT_DIR=C:\Jenkins ) - +set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3 :: Be conservative here when rolling out the new AMI with conda. This will try :: to install conda as before if it couldn't find the conda installation. This :: can be removed eventually after we gain enough confidence in the AMI -if not exist %CONDA_PARENT_DIR%\Miniconda3 ( +if not exist %CONDA_ROOT_DIR% ( set INSTALL_FRESH_CONDA=1 ) @@ -17,10 +17,14 @@ if "%INSTALL_FRESH_CONDA%"=="1" ( if errorlevel 1 exit /b if not errorlevel 0 exit /b - %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3 + %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR% if errorlevel 1 exit /b if not errorlevel 0 exit /b ) :: Activate conda so that we can use its commands, i.e. conda, python, pip -call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3 +call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR% +:: Activate conda so that we can use its commands, i.e. conda, python, pip +call conda activate py_tmp + +call pip install -r .ci/docker/requirements-ci.txt diff --git a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat index 4a464d6b5786a..3173582b06f45 100644 --- a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -14,7 +14,7 @@ if not errorlevel 0 exit /b :: build\torch. Rather than changing all these references, making a copy of torch folder :: from conda to the current workspace is easier. The workspace will be cleaned up after :: the job anyway -xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ +xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ pushd . 
if "%VC_VERSION%" == "" ( diff --git a/.ci/pytorch/win-test-helpers/test_python_shard.bat b/.ci/pytorch/win-test-helpers/test_python_shard.bat index d0fa3babe59d5..02829ee369757 100644 --- a/.ci/pytorch/win-test-helpers/test_python_shard.bat +++ b/.ci/pytorch/win-test-helpers/test_python_shard.bat @@ -25,7 +25,7 @@ echo Copying over test times file robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files" echo Run nn tests -python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose +python run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose if ERRORLEVEL 1 goto fail popd diff --git a/.ci/pytorch/win-test.sh b/.ci/pytorch/win-test.sh index 43524dc04e3fb..c96d5c331c9f8 100755 --- a/.ci/pytorch/win-test.sh +++ b/.ci/pytorch/win-test.sh @@ -38,7 +38,14 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then fi # TODO: Move both of them to Windows AMI -python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 +python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 + +# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments +# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node' +# scipy from 1.6.3 to 1.10 +# expecttest from 0.1.3 to 0.3.0 +# xdoctest from 1.0.2 to 1.3.0 +python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42" # Install Z3 optional dependency for Windows builds. 
python -m pip install z3-solver==4.15.1.0 @@ -52,9 +59,6 @@ python -m pip install parameterized==0.8.1 # Install pulp for testing ilps under torch\distributed\_tools python -m pip install pulp==2.9.0 -# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308 -python -m pip install expecttest==0.3.0 - run_tests() { # Run nvidia-smi if available for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do diff --git a/.ci/wheel/build_wheel.sh b/.ci/wheel/build_wheel.sh index 763fce4b73e18..98b50c0ceeafe 100755 --- a/.ci/wheel/build_wheel.sh +++ b/.ci/wheel/build_wheel.sh @@ -85,7 +85,7 @@ mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true # Create an isolated directory to store this builds pytorch checkout and conda # installation if [[ -z "$MAC_PACKAGE_WORK_DIR" ]]; then - MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_conda_${DESIRED_PYTHON}_$(date +%H%M%S)" + MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_${DESIRED_PYTHON}_$(date +%H%M%S)" fi mkdir -p "$MAC_PACKAGE_WORK_DIR" || true if [[ -n ${GITHUB_ACTIONS} ]]; then @@ -96,11 +96,11 @@ fi whl_tmp_dir="${MAC_PACKAGE_WORK_DIR}/dist" mkdir -p "$whl_tmp_dir" -mac_version='macosx_11_0_arm64' +mac_version='macosx-11_0-arm64' libtorch_arch='arm64' # Create a consistent wheel package name to rename the wheel to -wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version}.whl" +wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version//[-,]/_}.whl" ########################################################### @@ -125,7 +125,6 @@ popd export TH_BINARY_BUILD=1 export INSTALL_TEST=0 # dont install test binaries into site-packages export MACOSX_DEPLOYMENT_TARGET=11.0 -export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} EXTRA_CONDA_INSTALL_FLAGS="" CONDA_ENV_CREATE_FLAGS="" @@ -133,25 +132,19 @@ RENAME_WHEEL=true case $desired_python in 3.14t) echo "Using 3.14 deps" + mac_version='macosx-11.0-arm64' NUMPY_PINNED_VERSION="==2.1.0" - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" RENAME_WHEEL=false ;; 3.14) echo "Using 3.14t deps" + mac_version='macosx-11.0-arm64' NUMPY_PINNED_VERSION="==2.1.0" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" RENAME_WHEEL=false ;; 3.13t) echo "Using 3.13 deps" NUMPY_PINNED_VERSION="==2.1.0" - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" RENAME_WHEEL=false ;; 3.13) @@ -176,17 +169,12 @@ case $desired_python in ;; esac -# Install into a fresh env -tmp_env_name="wheel_py$python_nodot" -conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} -source activate "$tmp_env_name" - PINNED_PACKAGES=( "numpy${NUMPY_PINNED_VERSION}" ) -retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt" -pip install requests ninja typing-extensions -retry pip install -r "${pytorch_rootdir}/requirements.txt" || true +python -mvenv ~/${desired_python}-build +source ~/${desired_python}-build/bin/activate +retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt" retry brew install libomp # For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which @@ -200,7 +188,7 @@ export BUILD_TEST=OFF pushd 
"$pytorch_rootdir" echo "Calling setup.py bdist_wheel at $(date)" -python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name ${mac_version} +_PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name "${mac_version//[-.]/_}" echo "Finished setup.py bdist_wheel at $(date)" diff --git a/.flake8 b/.flake8 index fc9ab167fbeef..fa73b7b880fd3 100644 --- a/.flake8 +++ b/.flake8 @@ -73,7 +73,7 @@ exclude = ./docs/src, ./functorch/docs, ./functorch/examples, - ./functorch/notebooks, + ./functorch/docs/source/tutorials, ./scripts, ./test/generated_type_hints_smoketest.py, ./third_party, diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 798dee312306d..d4a7df9d5805b 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -21,6 +21,7 @@ self-hosted-runner: - linux.arm64.2xlarge.ephemeral - linux.arm64.m7g.4xlarge - linux.arm64.m7g.4xlarge.ephemeral + - linux.arm64.r7g.12xlarge.memory - linux.4xlarge.nvidia.gpu - linux.8xlarge.nvidia.gpu - linux.16xlarge.nvidia.gpu diff --git a/.github/actions/reuse-old-whl/reuse_old_whl.py b/.github/actions/reuse-old-whl/reuse_old_whl.py index def0276a9c8a3..48a8490985946 100644 --- a/.github/actions/reuse-old-whl/reuse_old_whl.py +++ b/.github/actions/reuse-old-whl/reuse_old_whl.py @@ -264,7 +264,7 @@ def change_content_to_new_version(file: Union[str, Path]) -> None: change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py") for file in Path(f"artifacts/dist/{old_stem}").glob( - "*.dist-info/**", + "*.dist-info/*", ): change_content_to_new_version(file) diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml index 93c957896b5e8..37cec0c571538 100644 --- a/.github/actions/setup-win/action.yml +++ b/.github/actions/setup-win/action.yml @@ -6,6 +6,12 @@ inputs: cuda-version: description: which cuda version to install, 'cpu' for none required: true + python-version: + required: false + type: string + default: "3.10" + description: | + The python version to be used. Will be 3.10 by default runs: using: composite @@ -38,18 +44,24 @@ runs: CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat" { + echo "CONDA=${CONDA}"; echo "CONDA_RUN=${CONDA} run --no-capture-output"; echo "CONDA_BUILD=${CONDA} run conda-build"; echo "CONDA_INSTALL=${CONDA} install"; } >> "${GITHUB_ENV}" - name: Setup Python3 + env: + PYTHON_VERSION: ${{ inputs.python-version }} shell: bash run: | set +e set -x - PYTHON3=$(${CONDA_RUN} which python3) + # Create new py_tmp env with python-version + ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp + + PYTHON3=$(${CONDA_RUN} -n py_tmp which python3) EXIT_CODE=$? if [[ "${EXIT_CODE}" == "0" ]]; then @@ -62,7 +74,7 @@ runs: # installation, which is Python 3 based. Its Python is default to Python 3. Further, there # is also the Miniconda installation that is Python 2 based, and both can be installed if # needed. In both cases, Python binary is just called python - PYTHON=$(${CONDA_RUN} which python) + PYTHON=$(${CONDA_RUN} -n py_tmp which python) EXIT_CODE=$? 
if [[ "${EXIT_CODE}" == "0" ]]; then diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index b0255e764c594..05e0b684b4278 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -27fc2493d383354a008106f22f3be232badee9a1 +87ff22e49ed0e92576c4935ccb8c143daac4a3cd diff --git a/.github/ci_commit_pins/vllm.txt b/.github/ci_commit_pins/vllm.txt index c9c4265b2f37f..512b7c7da00e2 100644 --- a/.github/ci_commit_pins/vllm.txt +++ b/.github/ci_commit_pins/vllm.txt @@ -1 +1 @@ -e10fef08838612b4560e9c72e5cb1414a5edfa13 +1983609239caaab24ab1ed2bfa2aa92e8c76c1b1 diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index eb335eb9d64d5..504d924ec7641 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -6c5478ff7c3d50dd1e3047d72ec5909bea474073 +c77852e117bdf056c8e9a087e51d6f65cf6ba53d diff --git a/.github/ci_configs/vllm/Dockerfile.tmp_vllm b/.github/ci_configs/vllm/Dockerfile.tmp_vllm index 2cee6ed2df19a..a1b68ad28210d 100644 --- a/.github/ci_configs/vllm/Dockerfile.tmp_vllm +++ b/.github/ci_configs/vllm/Dockerfile.tmp_vllm @@ -82,16 +82,10 @@ RUN if command -v apt-get >/dev/null; then \ apt-get update -y \ && apt-get install -y ccache software-properties-common git curl wget sudo vim; \ else \ - dnf install -y git curl wget sudo vim; \ + dnf install -y git curl wget sudo; \ fi \ && python3 --version && python3 -m pip --version -# Workaround for https://github.com/openai/triton/issues/2507 and -# https://github.com/pytorch/pytorch/issues/107960 -- hopefully -# this won't be needed for future versions of this docker image -# or future versions of triton. -RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ - # Install uv for faster pip installs if not existed RUN --mount=type=cache,target=/root/.cache/uv \ if ! python3 -m uv --version >/dev/null 2>&1; then \ @@ -220,11 +214,16 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0 RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=.git,target=.git \ if [ "$USE_SCCACHE" = "1" ]; then \ - echo "Installing sccache..." 
\ - && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \ + echo "Installing sccache..."; \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + SCCACHE_ARCHIVE="sccache-v0.8.1-aarch64-unknown-linux-musl"; \ + else \ + SCCACHE_ARCHIVE="sccache-v0.8.1-x86_64-unknown-linux-musl"; \ + fi; \ + curl -L -o sccache.tar.gz "https://github.com/mozilla/sccache/releases/download/v0.8.1/${SCCACHE_ARCHIVE}.tar.gz" \ && tar -xzf sccache.tar.gz \ - && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ - && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ + && sudo mv "${SCCACHE_ARCHIVE}"/sccache /usr/bin/sccache \ + && rm -rf sccache.tar.gz "${SCCACHE_ARCHIVE}" \ && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ @@ -285,7 +284,7 @@ RUN if command -v apt-get >/dev/null; then \ && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \ else \ - dnf install -y git curl wget sudo vim; \ + dnf install -y git curl wget sudo; \ fi \ && python3 --version && python3 -m pip --version @@ -298,12 +297,6 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \ echo "[INFO] Showing torch_build_versions.txt content:" && \ cat torch_build_versions.txt -# Workaround for https://github.com/openai/triton/issues/2507 and -# https://github.com/pytorch/pytorch/issues/107960 -- hopefully -# this won't be needed for future versions of this docker image -# or future versions of triton. -RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ - # Install uv for faster pip installs if not existed RUN --mount=type=cache,target=/root/.cache/uv \ if ! 
python3 -m uv --version > /dev/null 2>&1; then \ diff --git a/.github/ci_configs/vllm/use_existing_torch.py b/.github/ci_configs/vllm/use_existing_torch.py new file mode 100644 index 0000000000000..f55db97850d9c --- /dev/null +++ b/.github/ci_configs/vllm/use_existing_torch.py @@ -0,0 +1,17 @@ +import glob + + +requires_files = glob.glob("requirements/*.txt") +requires_files += ["pyproject.toml"] +for file in requires_files: + print(f">>> cleaning {file}") + with open(file) as f: + lines = f.readlines() + if "torch" in "".join(lines).lower(): + print("removed:") + with open(file, "w") as f: + for line in lines: + if "torch" not in line.lower(): + f.write(line) + print(f"<<< done cleaning {file}") + print() diff --git a/.github/labeler.yml b/.github/labeler.yml index 8b1acc77c267f..eb4076d81331d 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -130,3 +130,6 @@ - torch/csrc/inductor/aoti_include/** - torchgen/aoti/** - torchgen/gen_aoti_c_shim.py + +"ciflow/vllm": +- .github/ci_commit_pins/vllm.txt diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index a0aa6921b92ba..9f0937eb9f04b 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -19,6 +19,7 @@ ciflow_push_tags: - ciflow/nightly - ciflow/periodic - ciflow/periodic-rocm-mi300 +- ciflow/quantization-periodic - ciflow/rocm - ciflow/rocm-mi300 - ciflow/s390 @@ -36,6 +37,7 @@ ciflow_push_tags: - ciflow/win-arm64 - ciflow/h100-symm-mem - ciflow/h100-cutlass-backend +- ciflow/b200 retryable_workflows: - pull - trunk diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt index 3a27cac46f71f..5fc26302a0add 100644 --- a/.github/requirements/pip-requirements-macOS.txt +++ b/.github/requirements/pip-requirements-macOS.txt @@ -15,7 +15,7 @@ optree==0.13.0 packaging==23.1 parameterized==0.8.1 pillow==10.3.0 -protobuf==5.29.4 +protobuf==5.29.5 psutil==5.9.8 pygments==2.15.0 pytest-cpp==2.3.0 @@ -26,7 +26,7 @@ pytest-xdist==3.3.1 pytest==7.3.2 pyyaml==6.0.2 scipy==1.12.0 -setuptools==72.1.0 +setuptools==78.1.1 sympy==1.13.3 tlparse==0.4.0 tensorboard==2.13.0 diff --git a/.github/scripts/docathon-label-sync.py b/.github/scripts/docathon-label-sync.py index ccd2eb0f4bd0f..04f4707a55c3f 100644 --- a/.github/scripts/docathon-label-sync.py +++ b/.github/scripts/docathon-label-sync.py @@ -39,7 +39,9 @@ def main() -> None: pull_request_label_names = [label.name for label in pull_request_labels] issue_label_names = [label.name for label in issue_labels] labels_to_add = [ - label for label in issue_label_names if label not in pull_request_label_names + label + for label in issue_label_names + if label not in pull_request_label_names and label != "actionable" ] if not labels_to_add: print("The pull request already has the same labels.") diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index 4a4f8a65f684d..e57c2d5ef0749 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -43,55 +43,55 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { "12.6": ( - "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " - 
"nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'" + "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | " + "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | " + "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | " + "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | " + "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | " + "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | " + "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | " + "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " + "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " + "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | " + "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | " + "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | " + "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'" ), "12.8": ( - "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'" + "nvidia-cuda-nvrtc-cu12==12.8.93; 
platform_system == 'Linux' | " + "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | " + "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | " + "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | " + "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | " + "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | " + "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | " + "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " + "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " + "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | " + "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | " + "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | " + "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'" ), "13.0": ( - "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'" + "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | " + "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | " + "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | " + "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | " + "nvidia-cublas==13.0.0.19; platform_system == 'Linux' | " + "nvidia-cufft==12.0.0.15; platform_system == 'Linux' | " + "nvidia-curand==10.4.0.35; platform_system == 'Linux' | " + "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | " + "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | " + "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | " + "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | " + "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | " + "nvidia-nvtx==13.0.39; platform_system == 'Linux' | " + "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | " + "nvidia-cufile==1.15.0.42; platform_system == 'Linux'" ), "xpu": ( "intel-cmplr-lib-rt==2025.2.1 | " diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 67906d4ad88d5..0396c405ad0a7 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py 
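# Editor's aside on the PYTORCH_EXTRA_INSTALL_REQUIREMENTS changes above (illustrative
# sketch using the `packaging` library; not part of this module): dropping the
# platform_machine == 'x86_64' clause makes the nvidia-* pins apply to every Linux
# machine, including aarch64, which is why the marker rewrite could be removed from
# aarch64_ci_build.sh earlier in this diff. How pip evaluates the two markers on an
# aarch64 Linux box:
#
#   from packaging.markers import Marker
#
#   arm_linux = {"platform_system": "Linux", "platform_machine": "aarch64"}
#   old = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")
#   new = Marker("platform_system == 'Linux'")
#   assert old.evaluate(environment=arm_linux) is False
#   assert new.evaluate(environment=arm_linux) is True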
@@ -135,7 +135,7 @@ class OperatingSystem: build_configs=generate_binary_build_matrix.generate_wheels_matrix( OperatingSystem.LINUX, arches=["6.4"], - python_versions=["3.9"], + python_versions=["3.10"], ), ciflow_config=CIFlowConfig( labels={ diff --git a/.github/scripts/prepare_vllm_wheels.sh b/.github/scripts/prepare_vllm_wheels.sh new file mode 100755 index 0000000000000..62362c7ff207c --- /dev/null +++ b/.github/scripts/prepare_vllm_wheels.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash + +set -eux + +torch_version=$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) +nightly=$(echo ${torch_version} | cut -d'.' -f4) + +# Copied from .ci/manywheel/build_common.sh +make_wheel_record() { + fpath=$1 + if echo $fpath | grep RECORD >/dev/null 2>&1; then + echo "$fpath,," + else + fhash=$(openssl dgst -sha256 -binary $fpath | openssl base64 | sed -e 's/+/-/g' | sed -e 's/\//_/g' | sed -e 's/=//g') + fsize=$(ls -nl $fpath | awk '{print $5}') + echo "$fpath,sha256=$fhash,$fsize" + fi +} + +change_wheel_version() { + local package=$1 + local wheel=$2 + local f_version=$3 + local t_version=$4 + + # Extract the wheel + ${PYTHON_EXECUTABLE} -mwheel unpack $wheel + + mv "${package}-${f_version}" "${package}-${t_version}" + # Change the version from f_version to t_version in the dist-info dir + pushd "${package}-${t_version}" + mv "${package}-${f_version}.dist-info" "${package}-${t_version}.dist-info" + + pushd "${package}-${t_version}.dist-info" + sed -i "s/${package}-${f_version}.dist-info/${package}-${t_version}.dist-info/g" RECORD + + # Update the version in METADATA and its SHA256 hash + sed -i "s/Version: ${f_version}/Version: ${t_version}/g" METADATA + # then add PyTorch nightly dependency of vLLM + if [[ "${package}" == vllm ]] || [[ "${package}" == xformers ]]; then + sed -i "/License-File/a\Requires-Dist: torch==${torch_version}" METADATA + fi + sed -i '/METADATA,sha256/d' RECORD + popd + + make_wheel_record "${package}-${t_version}.dist-info/METADATA" >> "${package}-${t_version}.dist-info/RECORD" + popd + + # Repack the wheel + ${PYTHON_EXECUTABLE} -mwheel pack "${package}-${t_version}" + + # Clean up + rm -rf "${package}-${t_version}" +} + +repackage_wheel() { + local package=$1 + pushd $package + + local orig_wheel=$(find . -name *${package//-/_}*) + local orig_version=$(unzip -p $orig_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) + + local version="" + if [[ "${package}" == vllm ]]; then + # Copied from vllm/.buildkite/scripts/upload-wheels.sh + version=1.0.0 + else + version=$(echo $orig_version | tr '.+' '.' | cut -d'.' -f1-3) + fi + local nightly_version=$version.$nightly + + # Use nightly version + change_wheel_version ${package//-/_} $orig_wheel $orig_version $nightly_version + # Clean up + rm "${orig_wheel}" + + auditwheel repair --plat $PLATFORM *.whl \ + --exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv* + local repair_wheel=$(find wheelhouse -name *${PLATFORM}*) + local repair_wheel=$(basename ${repair_wheel}) + popd + + cp ${package}/wheelhouse/${repair_wheel} . 
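# Editor's aside on prepare_vllm_wheels.sh (hedged walk-through with made-up version
# strings; the real values depend on the nightly being built): everything is derived
# from the torch wheel's METADATA.
#
#   torch_version=2.10.0.dev20250101    # "Version:" field of torch-*.whl METADATA
#   nightly=dev20250101                 # 4th dot-separated field of torch_version
#   # xformers 0.0.33+abc123 -> base 0.0.33 -> repackaged as 0.0.33.dev20250101
#   # vllm is pinned to 1.0.0 upstream   -> repackaged as 1.0.0.dev20250101
#
# change_wheel_version() renames the .dist-info directory, rewrites the Version in
# METADATA (adding a Requires-Dist: torch==<torch_version> pin for vllm and xformers),
# refreshes the METADATA entry in RECORD, and repacks the wheel before auditwheel
# repair runs.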
+ rm -rf $package +} + +# Require to re-package the wheel +${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1 + +pushd externals/vllm/wheels +for package in xformers flashinfer-python vllm; do + repackage_wheel $package +done +popd diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index fee9ca2eac120..a0f8befddf39e 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -71,7 +71,7 @@ jobs: with:!{{ upload.binary_env_as_input(config) }} {%- if "aarch64" in build_environment %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" {%- elif "s390x" in build_environment %} runs_on: linux.s390x diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index f4b2a66d2acda..7f307447c3576 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -22,6 +22,16 @@ name: !{{ build_environment }} echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" {%- endmacro %} +{%- macro setup_python(py_ver) -%} + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}" + freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }} +{%- endmacro %} + on: # TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 push: @@ -61,23 +71,13 @@ jobs: {%- endif %} steps: !{{ set_runner_specific_vars() }} - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + !{{ setup_python(config.get("python_version", "3.10")) }} !{{ common.checkout(deep_clone=False, directory="pytorch") }} - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -94,8 +94,6 @@ jobs: {%- if config["package_type"] == "wheel" %} - name: Test PyTorch wheel run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -106,33 +104,9 @@ jobs: SMOKE_TEST_PARAMS="" - EXTRA_CONDA_INSTALL_FLAGS="" - CONDA_ENV_CREATE_FLAGS="" - # shellcheck disable=SC2153 - case $DESIRED_PYTHON in - 3.14t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.14) - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c 
conda-forge" - desired_python="3.14.0rc1" - ;; - 3.13t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" - ;; - *) - # shellcheck disable=SC2153 - desired_python=${DESIRED_PYTHON} - ;; - esac - # shellcheck disable=SC2086 - conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} - conda activate test_conda_env + python -mvenv test_venv + source test_venv/bin/activate pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml index 2d9e4d0e27b25..476dd182db0f8 100644 --- a/.github/workflows/_binary-test-linux.yml +++ b/.github/workflows/_binary-test-linux.yml @@ -187,8 +187,6 @@ jobs: - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG uses: pytorch/test-infra/.github/actions/setup-nvidia@main - with: - driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }} if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }} - name: configure aws credentials diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml index ff5dbe604bac1..aba3fa3dceec2 100644 --- a/.github/workflows/_docs.yml +++ b/.github/workflows/_docs.yml @@ -75,10 +75,6 @@ jobs: runner: ${{ inputs.runner_prefix }}linux.2xlarge # It takes less than 30m to finish python docs unless there are issues timeout-minutes: 30 - - docs_type: functorch - runner: ${{ inputs.runner_prefix }}linux.2xlarge - # It takes less than 15m to finish functorch docs unless there are issues - timeout-minutes: 15 # Set a fixed name for this job instead of using the current matrix-generated name, i.e. 
build-docs (cpp, linux.12xlarge, 180) # The current name requires updating the database last docs push query from test-infra every time the matrix is updated name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }} @@ -211,16 +207,6 @@ jobs: path: cppdocs/ s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs - - name: Upload functorch Docs Preview - uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 - if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }} - with: - retention-days: 14 - s3-bucket: doc-previews - if-no-files-found: error - path: functorch_ghpages/nightly/ - s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs - - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main if: always() diff --git a/.github/workflows/_get-changed-files.yml b/.github/workflows/_get-changed-files.yml index 55712b0652702..311c594a11eff 100644 --- a/.github/workflows/_get-changed-files.yml +++ b/.github/workflows/_get-changed-files.yml @@ -2,6 +2,12 @@ name: Get Changed Files on: workflow_call: + inputs: + all_files: + description: "Whether to return all files instead of just changed files" + required: false + type: boolean + default: false outputs: changed-files: description: "List of changed files (space-separated) or '*' if not in a PR" @@ -26,17 +32,23 @@ jobs: # Get the PR number from the github context PR_NUMBER="${{ github.event.number }}" - # Use gh CLI to get changed files in the PR with explicit repo - CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//') - - if [ -z "$CHANGED_FILES" ]; then - echo "No changed files found, setting to '*'" - CHANGED_FILES="*" + # Check if all_files is requested + if [ "${{ inputs.all_files }}" = "true" ]; then + echo "all_files input is true, returning all files" + echo "changed-files=*" >> "$GITHUB_OUTPUT" + else + # Use gh CLI to get changed files in the PR with explicit repo + CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//') + + if [ -z "$CHANGED_FILES" ]; then + echo "No changed files found, setting to '*'" + CHANGED_FILES="*" + fi + + echo "Changed files: $CHANGED_FILES" + echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT" fi - echo "Changed files: $CHANGED_FILES" - echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT" - else echo "Not in PR context, setting changed files to '*'" echo "changed-files=*" >> "$GITHUB_OUTPUT" diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 66579b573a63d..537e94488b363 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -169,7 +169,7 @@ jobs: id: install-nvidia-driver uses: pytorch/test-infra/.github/actions/setup-nvidia@main with: - driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }} + driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }} if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }} - name: Setup GPU_FLAG for docker run diff --git a/.github/workflows/_rocm-test.yml 
b/.github/workflows/_rocm-test.yml index f73972942b5f9..7781e1f65fd16 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -62,6 +62,11 @@ on: required: false type: number default: 1 + secrets: + HUGGING_FACE_HUB_TOKEN: + required: false + description: | + HF Auth token to avoid rate limits when downloading models or datasets from hub env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} @@ -76,10 +81,9 @@ jobs: strategy: matrix: ${{ fromJSON(inputs.test-matrix) }} fail-fast: false - timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} runs-on: ${{ matrix.runner }} + timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} steps: - # [see note: pytorch repo ref] - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@main with: @@ -131,6 +135,9 @@ jobs: - name: Start monitoring script id: monitor-script + if: ${{ !inputs.disable-monitor }} + shell: bash + continue-on-error: true env: JOB_ID: ${{ steps.get-job-id.outputs.job-id }} JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} @@ -138,9 +145,6 @@ jobs: WORKFLOW_RUN_ID: ${{github.run_id}} MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} - if: ${{ !inputs.disable-monitor }} - shell: bash - continue-on-error: true run: | python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & @@ -178,6 +182,12 @@ jobs: run: | echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}" + - name: Preserve github env variables for use in docker + shell: bash + run: | + env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" + env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Test id: test env: @@ -193,20 +203,22 @@ jobs: JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} BRANCH: ${{ steps.parse-ref.outputs.branch }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }} + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }} NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} DOCKER_IMAGE: ${{ inputs.docker-image }} PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }} DASHBOARD_TAG: ${{ inputs.dashboard-tag }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }} run: | set -x @@ -236,6 +248,7 @@ jobs: -e GITHUB_RUN_ATTEMPT \ -e JOB_ID \ -e JOB_NAME \ + -e BASE_SHA \ -e BRANCH \ -e SHA1 \ -e AWS_DEFAULT_REGION \ @@ -253,10 +266,12 @@ jobs: -e 
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \ -e PYTORCH_TEST_RERUN_DISABLED_TESTS \ -e TESTS_TO_INCLUDE \ + -e HUGGING_FACE_HUB_TOKEN \ -e DASHBOARD_TAG \ --env-file="${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" \ --ulimit stack=10485760:83886080 \ --ulimit core=0 \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ --shm-size="8g" \ diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index 7067d79eb0758..d447dba4a511c 100644 --- a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -151,7 +151,7 @@ jobs: BUILD_WHEEL: 1 MAX_JOBS: 8 CUDA_VERSION: ${{ inputs.cuda-version }} - PYTHON_VERSION: "3.9" + PYTHON_VERSION: "3.10" SCCACHE_BUCKET: "ossci-compiler-cache" SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} SCCACHE_REGION: us-east-1 diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index 5049ef61f6930..a93f10c123aac 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -184,7 +184,7 @@ jobs: env: USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }} INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: 3.9 + PYTHON_VERSION: "3.10" CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }} diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index 932d9c8863027..b9ccc6fc361a7 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -50,7 +50,7 @@ jobs: strategy: fail-fast: false matrix: - py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] + py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] device: ["cuda", "rocm", "xpu", "aarch64"] docker-image: ["pytorch/manylinux2_28-builder:cpu"] include: @@ -108,9 +108,6 @@ jobs: # Determine python executable for given version case $PY_VERS in - 3.9) - PYTHON_EXECUTABLE=/opt/python/cp39-cp39/bin/python - ;; 3.10) PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python ;; @@ -194,7 +191,7 @@ jobs: strategy: fail-fast: false matrix: - py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] + py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] device: ["xpu"] timeout-minutes: 40 env: diff --git a/.github/workflows/build-vllm-wheel.yml b/.github/workflows/build-vllm-wheel.yml index 658e02ede6fbd..2c66353748417 100644 --- a/.github/workflows/build-vllm-wheel.yml +++ b/.github/workflows/build-vllm-wheel.yml @@ -12,6 +12,9 @@ on: paths: - .github/workflows/build-vllm-wheel.yml - .github/ci_commit_pins/vllm.txt + schedule: + # every morning at 01:30PM UTC, 9:30AM EST, 6:30AM PST + - cron: 30 13 * * * concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} @@ -24,21 +27,33 @@ jobs: fail-fast: false matrix: python-version: [ '3.12' ] - # TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554 + # TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved + platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ] device: [ 'cu128', 'cu129' ] - runner: [ 'linux.12xlarge.memory' ] include: - - device: cu128 + - platform: manylinux_2_28_x86_64 + device: cu128 manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8' - - device: cu129 + 
runner: linux.12xlarge.memory + - platform: manylinux_2_28_x86_64 + device: cu129 manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9' - name: "Build ${{ matrix.device }} vLLM wheel" + runner: linux.12xlarge.memory + - platform: manylinux_2_28_aarch64 + device: cu128 + manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8' + runner: linux.arm64.r7g.12xlarge.memory + - platform: manylinux_2_28_aarch64 + device: cu129 + manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9' + runner: linux.arm64.r7g.12xlarge.memory + name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}" runs-on: ${{ matrix.runner }} timeout-minutes: 480 env: PY_VERS: ${{ matrix.python-version }} MANYLINUX_IMAGE: ${{ matrix.manylinux-image }} - PLATFORM: 'manylinux_2_28_x86_64' + PLATFORM: ${{ matrix.platform }} BUILD_DEVICE: ${{ matrix.device }} steps: - name: Setup SSH (Click me for login details) @@ -59,20 +74,6 @@ jobs: run: | set -eux - # Keep PyTorch nightly wheel here so that we can install it later during - # vLLM build process - mkdir -p "${RUNNER_TEMP}/artifacts/" - - container_name=$(docker run \ - --tty \ - --detach \ - -e PLATFORM \ - -v "${GITHUB_WORKSPACE}:/pytorch" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w /artifacts/ \ - "${MANYLINUX_IMAGE}" - ) - # Determine python executable for given version (copied from build-triton-wheel) case $PY_VERS in 3.10) @@ -102,6 +103,21 @@ jobs: ;; esac + # Keep PyTorch nightly wheel here so that we can install it later during + # vLLM build process + mkdir -p "${RUNNER_TEMP}/artifacts/" + + container_name=$(docker run \ + --tty \ + --detach \ + -e PLATFORM \ + -e PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \ + -v "${GITHUB_WORKSPACE}:/pytorch" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w /artifacts/ \ + "${MANYLINUX_IMAGE}" + ) + docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip install \ --pre torch torchvision torchaudio \ --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}" @@ -113,7 +129,6 @@ jobs: --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}" # Save this for later - echo "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}" >> "$GITHUB_ENV" echo "container_name=${container_name}" >> "$GITHUB_ENV" - name: Build vLLM wheel @@ -131,41 +146,12 @@ jobs: set -eux # Get these wheels ready, the vllm renaming logic is copied from its .buildkite/scripts/upload-wheels.sh - docker exec -t "${container_name}" bash -c " - set -eux - - nightly=\$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2 | cut -d'.' -f4) - - pushd externals/vllm/wheels - for package in xformers flashinfer-python vllm; do - pushd \$package - auditwheel repair --plat \$PLATFORM *.whl \ - --exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv* - repair_wheel=\$(find wheelhouse -name *\${PLATFORM}*) - repair_wheel=\$(basename \${repair_wheel}) - popd - - cp \${package}/wheelhouse/\${repair_wheel} . - version=\$(unzip -p \$repair_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) - - if [[ \$package == vllm ]]; then - new_wheel=\${repair_wheel/\$version/1.0.0.\$nightly} - else - major_version=\$(echo \$version | tr '.+' '.' | cut -d'.' 
-f1-3) - new_wheel=\${repair_wheel/\$version/\$major_version.\$nightly} - fi - - mv -- \$repair_wheel \$new_wheel - rm -rf \$package - done - popd - " - + docker exec -t "${container_name}" bash -c /pytorch/.github/scripts/prepare_vllm_wheels.sh docker exec -t "${container_name}" chown -R 1000:1000 /artifacts - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: - name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }} + name: vllm-wheel-${{ matrix.device }}-${{ matrix.platform }}-${{ matrix.python-version }} if-no-files-found: error path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl @@ -175,27 +161,29 @@ jobs: # Copied from build-triton-wheel workflow (mostly) upload-wheel: - name: "Upload ${{ matrix.device }} vLLM wheel" + name: "Upload ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}" needs: - build-wheel runs-on: ubuntu-latest strategy: fail-fast: false matrix: + platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ] device: [ 'cu128', 'cu129' ] env: + PLATFORM: ${{ matrix.platform }} BUILD_DEVICE: ${{ matrix.device }} permissions: id-token: write contents: read container: image: continuumio/miniconda3:4.12.0 - environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }} + environment: ${{ ((github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && 'nightly-wheel-upload' || '' }} steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Configure AWS credentials(PyTorch account) for main - if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }} + if: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 with: role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels @@ -219,15 +207,15 @@ jobs: run: | set -eux mkdir -p "${RUNNER_TEMP}/artifacts/" - mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/" + mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-"${PLATFORM}"-*/* "${RUNNER_TEMP}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }} + - name: Set DRY_RUN + if: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} shell: bash run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) + - name: Set UPLOAD_CHANNEL if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }} shell: bash run: | diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 492f41775d9de..272a2d1c691db 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -70,9 +70,8 @@ jobs: pytorch-linux-jammy-py3-clang18-asan, pytorch-linux-jammy-py3-clang12-onnx, pytorch-linux-jammy-linter, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter, - # Executorch pin needs update - # pytorch-linux-jammy-py3-clang12-executorch, + 
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter, + pytorch-linux-jammy-py3-clang12-executorch, pytorch-linux-jammy-py3.12-triton-cpu, pytorch-linux-noble-riscv64-py3.12-gcc14 ] diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index 860ee21cda6a7..651b034b2edc1 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -62,7 +62,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -128,11 +128,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | 
nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -174,11 +174,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -220,11 +220,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -265,7 +265,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -331,11 +331,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 
'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -377,11 +377,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; 
platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -423,11 +423,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 
'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -468,7 +468,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -534,11 +534,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -580,11 +580,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-12_8 
build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -626,11 +626,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -671,7 +671,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -737,11 +737,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -783,11 +783,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system 
== 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -829,11 +829,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -874,7 +874,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ 
-940,11 +940,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -986,11 +986,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1032,11 +1032,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1077,7 +1077,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -1143,11 +1143,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | 
nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1189,11 +1189,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; 
platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1235,11 +1235,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1280,7 +1280,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -1346,11 +1346,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - 
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1392,11 +1392,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1438,11 +1438,11 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; 
platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-linux-binary-manywheel-main.yml b/.github/workflows/generated-linux-binary-manywheel-main.yml index ec08b2c78eb67..96b9f9f739f72 100644 --- a/.github/workflows/generated-linux-binary-manywheel-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-main.yml @@ -60,7 +60,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | 
nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-test: # Testing diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 8a581a1f21fe1..0f87f97df694d 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -127,7 +127,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-test: # Testing @@ -193,7 +193,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-test: # Testing @@ -259,7 +259,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 
'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda13_0-test: # Testing @@ -719,7 +719,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | 
nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-test: # Testing @@ -785,7 +785,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-test: # Testing @@ -851,7 +851,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda13_0-test: # Testing @@ -1311,7 +1311,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 
'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-test: # Testing @@ -1377,7 +1377,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-test: # 
Testing @@ -1443,7 +1443,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda13_0-test: # Testing @@ -1903,7 +1903,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-test: # Testing @@ -1969,7 +1969,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | 
nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-test: # Testing @@ -2035,7 +2035,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda13_0-test: # Testing @@ -2495,7 +2495,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-test: # Testing @@ -2561,7 +2561,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-test: # Testing @@ -2627,7 +2627,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; 
platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda13_0-test: # Testing @@ -3087,7 +3087,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_6-test: # Testing @@ -3153,7 +3153,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_8-test: # Testing @@ -3219,7 +3219,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' 
and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda13_0-test: # Testing @@ -3679,7 +3679,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: 
github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_6-test: # Testing @@ -3745,7 +3745,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_8-test: # Testing @@ -3811,7 +3811,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda13_0-test: # Testing diff --git a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml index 8177bac3fe216..18706347026ba 100644 --- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml @@ -44,7 +44,7 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - manywheel-py3_9-rocm6_4-build: + manywheel-py3_10-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -58,16 +58,16 @@ jobs: GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-rocm6_4 + build_name: manywheel-py3_10-rocm6_4 build_environment: linux-binary-manywheel-rocm secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-rocm6_4-test: # Testing + manywheel-py3_10-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_9-rocm6_4-build + - manywheel-py3_10-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -82,14 +82,14 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_9-rocm6_4 + name: manywheel-py3_10-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 diff --git 
a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml index 500f8fa07af6b..cd912650eb17d 100644 --- a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml @@ -60,13 +60,13 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "3.10.4" + freethreaded: false - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -81,13 +81,9 @@ jobs: working-directory: pytorch - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml index 6aee57b503aa2..8522d2d369930 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -56,13 +56,13 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "3.10.4" + freethreaded: false - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -77,13 +77,9 @@ jobs: working-directory: pytorch - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -99,8 +95,6 @@ jobs: 
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -111,33 +105,9 @@ jobs: SMOKE_TEST_PARAMS="" - EXTRA_CONDA_INSTALL_FLAGS="" - CONDA_ENV_CREATE_FLAGS="" - # shellcheck disable=SC2153 - case $DESIRED_PYTHON in - 3.14t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.14) - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.13t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" - ;; - *) - # shellcheck disable=SC2153 - desired_python=${DESIRED_PYTHON} - ;; - esac - # shellcheck disable=SC2086 - conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} - conda activate test_conda_env + python -mvenv test_venv + source test_venv/bin/activate pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -196,13 +166,13 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "3.11.4" + freethreaded: false - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -217,13 +187,9 @@ jobs: working-directory: pytorch - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -239,8 +205,6 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -251,33 +215,9 @@ jobs: SMOKE_TEST_PARAMS="" - EXTRA_CONDA_INSTALL_FLAGS="" - CONDA_ENV_CREATE_FLAGS="" - # shellcheck disable=SC2153 - case $DESIRED_PYTHON in - 3.14t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.14) - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.13t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" - ;; - *) - # shellcheck disable=SC2153 - 
desired_python=${DESIRED_PYTHON} - ;; - esac - # shellcheck disable=SC2086 - conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} - conda activate test_conda_env + python -mvenv test_venv + source test_venv/bin/activate pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -336,13 +276,13 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "3.12.4" + freethreaded: false - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -357,13 +297,9 @@ jobs: working-directory: pytorch - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -379,8 +315,6 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -391,33 +325,9 @@ jobs: SMOKE_TEST_PARAMS="" - EXTRA_CONDA_INSTALL_FLAGS="" - CONDA_ENV_CREATE_FLAGS="" - # shellcheck disable=SC2153 - case $DESIRED_PYTHON in - 3.14t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.14) - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.13t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" - ;; - *) - # shellcheck disable=SC2153 - desired_python=${DESIRED_PYTHON} - ;; - esac - # shellcheck disable=SC2086 - conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} - conda activate test_conda_env + python -mvenv test_venv + source test_venv/bin/activate pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -476,13 +386,13 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x 
"${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "3.13.4" + freethreaded: false - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -497,13 +407,9 @@ jobs: working-directory: pytorch - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -519,8 +425,6 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -531,33 +435,9 @@ jobs: SMOKE_TEST_PARAMS="" - EXTRA_CONDA_INSTALL_FLAGS="" - CONDA_ENV_CREATE_FLAGS="" - # shellcheck disable=SC2153 - case $DESIRED_PYTHON in - 3.14t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.14) - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.13t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" - ;; - *) - # shellcheck disable=SC2153 - desired_python=${DESIRED_PYTHON} - ;; - esac - # shellcheck disable=SC2086 - conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} - conda activate test_conda_env + python -mvenv test_venv + source test_venv/bin/activate pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -616,13 +496,13 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "3.13.4" + freethreaded: true - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -637,13 +517,9 @@ jobs: working-directory: pytorch - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source 
"${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -659,8 +535,6 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -671,33 +545,9 @@ jobs: SMOKE_TEST_PARAMS="" - EXTRA_CONDA_INSTALL_FLAGS="" - CONDA_ENV_CREATE_FLAGS="" - # shellcheck disable=SC2153 - case $DESIRED_PYTHON in - 3.14t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.14) - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.13t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" - ;; - *) - # shellcheck disable=SC2153 - desired_python=${DESIRED_PYTHON} - ;; - esac - # shellcheck disable=SC2086 - conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} - conda activate test_conda_env + python -mvenv test_venv + source test_venv/bin/activate pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -756,13 +606,13 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "3.14.0-rc.2" + freethreaded: false - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -777,13 +627,9 @@ jobs: working-directory: pytorch - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -799,8 +645,6 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -811,33 +655,9 @@ jobs: SMOKE_TEST_PARAMS="" - EXTRA_CONDA_INSTALL_FLAGS="" - CONDA_ENV_CREATE_FLAGS="" - # shellcheck disable=SC2153 - case $DESIRED_PYTHON in - 3.14t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.14) - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.13t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - 
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" - ;; - *) - # shellcheck disable=SC2153 - desired_python=${DESIRED_PYTHON} - ;; - esac - # shellcheck disable=SC2086 - conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} - conda activate test_conda_env + python -mvenv test_venv + source test_venv/bin/activate pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -896,13 +716,13 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Setup Python + uses: actions/setup-python@v6 + with: + # TODO: Removeme once 3.14 is out + # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 + python-version: "3.14.0-rc.2" + freethreaded: true - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -917,13 +737,9 @@ jobs: working-directory: pytorch - name: Populate binary env run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -939,8 +755,6 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -951,33 +765,9 @@ jobs: SMOKE_TEST_PARAMS="" - EXTRA_CONDA_INSTALL_FLAGS="" - CONDA_ENV_CREATE_FLAGS="" - # shellcheck disable=SC2153 - case $DESIRED_PYTHON in - 3.14t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.14) - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" - desired_python="3.14.0rc1" - ;; - 3.13t) - CONDA_ENV_CREATE_FLAGS="python-freethreading" - EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" - desired_python="3.13" - ;; - *) - # shellcheck disable=SC2153 - desired_python=${DESIRED_PYTHON} - ;; - esac - # shellcheck disable=SC2086 - conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} - conda activate test_conda_env + python -mvenv test_venv + source test_venv/bin/activate pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 diff --git a/.github/workflows/inductor-nightly.yml b/.github/workflows/inductor-nightly.yml index fe0f102406b6a..78602e05586b7 100644 --- a/.github/workflows/inductor-nightly.yml +++ b/.github/workflows/inductor-nightly.yml @@ -37,7 +37,7 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: - build-environment: linux-jammy-py3.9-gcc11-build + 
build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | @@ -56,7 +56,7 @@ jobs: uses: ./.github/workflows/_linux-test.yml needs: nightly-dynamo-benchmarks-build with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }} test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }} timeout-minutes: 720 diff --git a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml index 170de752ab875..a7110b0fd9328 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml @@ -43,6 +43,11 @@ on: required: false type: boolean default: false + freezing: + description: Run freezing? + required: false + type: boolean + default: true benchmark_configs: description: The list of configs used the benchmark required: false @@ -75,7 +80,7 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ @@ -101,8 +106,8 @@ jobs: needs: inductor-build if: github.event.schedule == '0 7 * * *' with: - build-environment: linux-jammy-py3.9-gcc11-build - dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true + build-environment: linux-jammy-py3.10-gcc11-build + dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 @@ -116,10 +121,9 @@ jobs: name: inductor-test uses: ./.github/workflows/_linux-test.yml needs: inductor-build - if: github.event_name == 'workflow_dispatch' with: - build-environment: linux-jammy-py3.9-gcc11-build - dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }} + build-environment: linux-jammy-py3.10-gcc11-build + dashboard-tag: training-${{ inputs.training || 'false' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'true' }}-aotinductor-${{ inputs.aotinductor || 'true' }}-freezing-${{ inputs.freezing || 'true' }} docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 diff --git a/.github/workflows/inductor-perf-test-nightly-x86.yml b/.github/workflows/inductor-perf-test-nightly-x86.yml index f894b8fdc6e03..0533184df2e0e 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86.yml @@ -80,7 +80,7 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: 
ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ @@ -107,7 +107,7 @@ jobs: needs: inductor-build if: github.event.schedule == '0 7 * * *' with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} @@ -124,7 +124,7 @@ jobs: needs: inductor-build if: github.event_name == 'workflow_dispatch' with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }} docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} diff --git a/.github/workflows/inductor-periodic.yml b/.github/workflows/inductor-periodic.yml index 21d965eaeaada..454cd166c90bb 100644 --- a/.github/workflows/inductor-periodic.yml +++ b/.github/workflows/inductor-periodic.yml @@ -39,7 +39,7 @@ jobs: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks - cuda-arch-list: '8.6' + cuda-arch-list: '8.0;8.6' test-matrix: | { include: [ { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, @@ -62,7 +62,7 @@ jobs: { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, @@ -154,7 +154,7 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | @@ -200,7 +200,7 @@ jobs: uses: ./.github/workflows/_linux-test.yml needs: periodic-dynamo-benchmarks-cpu-build with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/inductor-unittest.yml b/.github/workflows/inductor-unittest.yml index 2125a8559363b..6ab276a57fc4d 100644 --- 
a/.github/workflows/inductor-unittest.yml +++ b/.github/workflows/inductor-unittest.yml @@ -110,7 +110,7 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | @@ -127,7 +127,7 @@ jobs: uses: ./.github/workflows/_linux-test.yml needs: inductor-cpu-build with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 4189d24a7b14f..2616141c0dc2a 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -79,7 +79,7 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | @@ -101,7 +101,7 @@ jobs: uses: ./.github/workflows/_linux-test.yml needs: inductor-cpu-build with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b1a6dfb390711..80f78b01c9808 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -31,6 +31,8 @@ jobs: if: github.repository_owner == 'pytorch' name: Get changed files uses: ./.github/workflows/_get-changed-files.yml + with: + all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') }} lintrunner-clang: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main @@ -53,7 +55,7 @@ jobs: with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" - docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter + docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout # to run git rev-parse HEAD~:.ci/docker when a new image is needed fetch-depth: 0 @@ -264,10 +266,10 @@ jobs: with: submodules: false fetch-depth: 1 - - name: Setup Python 3.9 + - name: Setup Python 3.10 uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: '3.9' + python-version: '3.10' architecture: x64 cache: pip - name: Install dependencies diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 65b8781be7585..696c5b68b475b 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -54,7 +54,7 @@ jobs: - get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 docker-image: ${{ needs.docs-build.outputs.docker-image }} push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || 
startsWith(github.event.ref, 'refs/tags/v') }}
       run-doxygen: true
diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml
index aaf32c160f0dc..dcdc2cd0ba24e 100644
--- a/.github/workflows/operator_benchmark.yml
+++ b/.github/workflows/operator_benchmark.yml
@@ -14,6 +14,10 @@ on:
   schedule:
     # Run at 07:00 UTC every Sunday
     - cron: 0 7 * * 0
+  pull_request:
+    paths:
+      - benchmarks/operator_benchmark/**
+      - .github/workflows/operator_benchmark.yml
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@@ -29,7 +33,7 @@ jobs:
     name: opbenchmark-build
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
       test-matrix: |
         { include: [
@@ -42,7 +46,7 @@ jobs:
     name: opbenchmark-on-demand-build
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
       test-matrix: |
         { include: [
@@ -55,7 +59,7 @@ jobs:
     uses: ./.github/workflows/_linux-test.yml
     needs: opbenchmark-build
     with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
       docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
       test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
     secrets: inherit
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 3f13fbf276882..e0e1065c5aba0 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -316,32 +316,6 @@ jobs:
         ]}
     secrets: inherit
-  linux-jammy-py3-clang12-executorch-build:
-    if: false # Docker build needs pin update
-    name: linux-jammy-py3-clang12-executorch
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-py3-clang12-executorch
-      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
-      test-matrix: |
-        { include: [
-          { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-        ]}
-    secrets: inherit
-
-  linux-jammy-py3-clang12-executorch-test:
-    name: linux-jammy-py3-clang12-executorch
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-jammy-py3-clang12-executorch-build
-    if: false # Has been broken for a while
-    with:
-      build-environment: linux-jammy-py3-clang12-executorch
-      docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
-    secrets: inherit
-
   linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
     name: cuda12.8-py3.10-gcc9-sm75
     uses: ./.github/workflows/_linux-build.yml
diff --git a/.github/workflows/quantization-periodic.yml b/.github/workflows/quantization-periodic.yml
new file mode 100644
index 0000000000000..688f557eaf0e4
--- /dev/null
+++ b/.github/workflows/quantization-periodic.yml
@@ -0,0 +1,54 @@
+name: quantization-periodic
+
+on:
+  push:
+    tags:
+      - ciflow/quantization-periodic/*
+  workflow_dispatch:
+  schedule:
+    # run weekly
+    - cron: "45 0 * * 0"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type ==
'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + get-default-label-prefix: + name: get-default-label-prefix + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + opt_out_experiments: lf + + periodic-quantization-build: + name: periodic-quantization-build + uses: ./.github/workflows/_linux-build.yml + needs: get-default-label-prefix + with: + runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" + build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '8.9' + test-matrix: | + { include: [ + { config: "quantization", shard: 1, num_shards: 1, runner: "${{ needs.get-default-label-prefix.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + ]} + secrets: inherit + periodic-test-quantization: + name: periodic-test-quantization + uses: ./.github/workflows/_linux-test.yml + needs: periodic-quantization-build + with: + build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11 + docker-image: ${{ needs.periodic-quantization-build.outputs.docker-image }} + test-matrix: ${{ needs.periodic-quantization-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/test-b200.yml b/.github/workflows/test-b200.yml new file mode 100644 index 0000000000000..ef7f75bc4b2b4 --- /dev/null +++ b/.github/workflows/test-b200.yml @@ -0,0 +1,76 @@ +# B200 Smoke Tests CI Workflow +# +# This workflow runs smoke tests on B200 hardware +# +# Flow: +# 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200 +# 2. Runs smoke tests on linux.dgx.b200 runner +# 3. 
Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function +# +# Triggered by: +# - Pull requests modifying this workflow file +# - Manual dispatch +# - Schedule (every 6 hours) +# - Adding ciflow/b200 label to a PR (creates ciflow/b200/* tag) + +name: B200 Smoke Tests + +on: + pull_request: + paths: + - .github/workflows/test-b200.yml + workflow_dispatch: + schedule: + - cron: 0 4,10,16,22 * * * # every 6 hours + push: + tags: + - ciflow/b200/* + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-cuda12_8-py3_10-gcc11-sm100-build: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm100 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '10.0' + test-matrix: | + { include: [ + { config: "smoke_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" }, + ]} + # config: "smoke_b200" maps to test_python_smoke_b200() in .ci/pytorch/test.sh + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-sm100-test: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm100 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-sm100-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }} + aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 4dd465d70803d..0140c2d3c00cb 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -240,7 +240,7 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ @@ -255,7 +255,31 @@ jobs: - verify-cachebench-cpu-build - target-determination with: - build-environment: linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }} secrets: inherit + + linux-jammy-py3-clang12-executorch-build: + name: linux-jammy-py3-clang12-executorch + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py3-clang12-executorch + docker-image-name: 
ci-image:pytorch-linux-jammy-py3-clang12-executorch + test-matrix: | + { include: [ + { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + ]} + secrets: inherit + + linux-jammy-py3-clang12-executorch-test: + name: linux-jammy-py3-clang12-executorch + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-py3-clang12-executorch-build + with: + build-environment: linux-jammy-py3-clang12-executorch + docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/unstable.yml b/.github/workflows/unstable.yml index 7f0fe6058bd08..b5955127d9fb3 100644 --- a/.github/workflows/unstable.yml +++ b/.github/workflows/unstable.yml @@ -53,27 +53,3 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - - linux-jammy-py3_9-clang9-xla-build: - name: linux-jammy-py3_9-clang9-xla - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-clang9-xla - docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite - test-matrix: | - { include: [ - { config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" }, - ]} - secrets: inherit - - linux-jammy-py3_9-clang9-xla-test: - name: linux-jammy-py3_9-clang9-xla - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-py3_9-clang9-xla-build - with: - build-environment: linux-jammy-py3.9-clang9-xla - docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }} - secrets: inherit diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml index 654e88be386b6..b2768a8f767e2 100644 --- a/.github/workflows/vllm.yml +++ b/.github/workflows/vllm.yml @@ -36,6 +36,8 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: + # When building vLLM, uv doesn't like that we rename wheel without changing the wheel metadata + allow-reuse-old-whl: false build-additional-packages: "vision audio" build-external-packages: "vllm" build-environment: linux-jammy-cuda12.8-py3.12-gcc11 diff --git a/.gitignore b/.gitignore index d1fa4cd3caf28..ca87f1306e125 100644 --- a/.gitignore +++ b/.gitignore @@ -259,6 +259,9 @@ gen .pytest_cache aten/build/* +# Linker scripts for prioritized text optimization +cmake/linker_script.ld + # Bram plsdontbreak @@ -389,3 +392,5 @@ android/pytorch_android_torchvision/.cxx # Claude Code local configuration CLAUDE.local.md +/test_*.py +/debug_*.py diff --git a/.lintrunner.toml b/.lintrunner.toml index 944829fa38977..679a04981b07a 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -13,7 +13,7 @@ exclude_patterns = [ '**/fb/**', 'functorch/docs/**', 'functorch/examples/**', - 'functorch/notebooks/**', + 'functorch/docs/source/tutorials/**', 'torch/_inductor/fx_passes/serialized_patterns/**', 'torch/_inductor/autoheuristic/artifacts/**', 'scripts/**', @@ -49,7 +49,7 @@ init_command = [ 'mccabe==0.7.0', 'pycodestyle==2.14.0', 'pyflakes==3.4.0', - 'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"', + 'torchfix==0.4.0 ; 
python_version >= "3.10" and python_version < "3.13"', ] @@ -123,6 +123,7 @@ is_formatter = true code = 'MYPY' include_patterns = [ 'setup.py', + 'functorch/dim/**/*.py', 'torch/**/*.py', 'torch/**/*.pyi', 'caffe2/**/*.py', @@ -152,7 +153,7 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - 'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"', + 'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"', 'numpy==2.1.0 ; python_version >= "3.12"', 'expecttest==0.3.0', 'mypy==1.16.0', @@ -195,6 +196,7 @@ exclude_patterns = [ 'tools/test/gen_operators_yaml_test.py', 'tools/test/gen_oplist_test.py', 'tools/test/test_selective_build.py', + 'tools/experimental/dynamic_shapes/torchfuzz/**', ] command = [ 'python3', @@ -964,7 +966,6 @@ exclude_patterns = [ 'test/jit/**', # should be run through test/test_jit.py 'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py 'test/fx/**', # should be run through test/test_fx.py - 'test/bottleneck_test/**', # excluded by test/run_test.py 'test/package/**', # excluded by test/run_test.py 'test/distributed/argparse_util_test.py', 'test/distributed/bin/test_script.py', @@ -1410,8 +1411,6 @@ exclude_patterns = [ 'torch/utils/benchmark/utils/timer.py', 'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py', 'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py', - 'torch/utils/bottleneck/__init__.py', - 'torch/utils/bottleneck/__main__.py', 'torch/utils/bundled_inputs.py', 'torch/utils/checkpoint.py', 'torch/utils/collect_env.py', @@ -1568,7 +1567,6 @@ include_patterns = [ exclude_patterns = [ 'caffe2/**', 'functorch/docs/**', - 'functorch/notebooks/**', 'torch/_inductor/fx_passes/serialized_patterns/**', 'torch/_inductor/autoheuristic/artifacts/**', 'test/dynamo/cpython/**', diff --git a/BUILD.bazel b/BUILD.bazel index 2cbd36f06761b..5d7625b402947 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -810,7 +810,7 @@ cc_library( name = "torch_python", srcs = libtorch_python_core_sources + if_cuda(libtorch_python_cuda_sources) - + if_cuda(libtorch_python_distributed_sources) + + libtorch_python_distributed_sources + GENERATED_AUTOGRAD_PYTHON, hdrs = glob([ "torch/csrc/generic/*.cpp", @@ -832,36 +832,6 @@ pybind_extension( ], ) -cc_library( - name = "functorch", - hdrs = glob([ - "functorch/csrc/dim/*.h", - ]), - srcs = glob([ - "functorch/csrc/dim/*.cpp", - ]), - deps = [ - ":aten_nvrtc", - ":torch_python", - "@pybind11", - ], -) - -pybind_extension( - name = "functorch/_C", - copts=[ - "-DTORCH_EXTENSION_NAME=_C" - ], - srcs = [ - "functorch/csrc/init_dim_only.cpp", - ], - deps = [ - ":functorch", - ":torch_python", - ":aten_nvrtc", - ], -) - cc_binary( name = "torch/bin/torch_shm_manager", srcs = [ @@ -902,7 +872,6 @@ py_library( ], data = [ ":torch/_C.so", - ":functorch/_C.so", ":torch/bin/torch_shm_manager", ], ) diff --git a/CMakeLists.txt b/CMakeLists.txt index 21c867dd6b6e6..eb973f33fb8f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,4 @@ cmake_minimum_required(VERSION 3.27 FATAL_ERROR) -# cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW) # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this # sometimes makes XCode C compiler gets detected as "Clang", even when the C++ @@ -380,6 +379,13 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler" OFF "USE_CUDA" OFF) cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." 
ON "CPU_AARCH64" OFF) +# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le. +set(USE_PRIORITIZED_TEXT_DEFAULT OFF) +if(LINUX AND CPU_AARCH64) + set(USE_PRIORITIZED_TEXT_DEFAULT ON) +endif() +cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld." + "${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF) option(USE_MIMALLOC "Use mimalloc" OFF) # Enable third party mimalloc library to improve memory allocation performance @@ -657,6 +663,11 @@ endif(MSVC) string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all") +# Set linker max-page-size to 64KiB on AArch64 Linux +if(LINUX AND CPU_AARCH64) + add_link_options_if_supported("-z,max-page-size=0x10000") +endif() + # Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not # applicable to mobile are disabled by this variable. Setting # `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it @@ -874,7 +885,7 @@ cmake_dependent_option( "Whether to build the flash_attention kernel for scaled dot product attention.\ Will be disabled if not supported by the platform" ON - "USE_CUDA OR USE_ROCM;NOT MSVC" + "(USE_CUDA AND NOT MSVC) OR USE_ROCM" OFF) cmake_dependent_option( @@ -885,6 +896,17 @@ cmake_dependent_option( "USE_CUDA OR USE_ROCM" OFF) +IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH) + message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF") + set(USE_FBGEMM_GENAI off) +endif() + +# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100. +if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32) + message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a") + set(USE_FBGEMM_GENAI ON) +endif() + # CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem # Eff Attention won't cmake_dependent_option( @@ -898,7 +920,7 @@ cmake_dependent_option( # USE_FLASH_ATTENTION -> USE_ROCM -> Dependencies.cmake -> aotriton.cmake # if(USE_ROCM) - if(UNIX AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)) + if(USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION) include(cmake/External/aotriton.cmake) endif() endif() @@ -1368,10 +1390,6 @@ endif() include(cmake/Summary.cmake) caffe2_print_configuration_summary() -if(BUILD_FUNCTORCH) - add_subdirectory(functorch) -endif() - # Parse custom debug info if(DEFINED USE_CUSTOM_DEBINFO) string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}") @@ -1410,3 +1428,57 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA) install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas" DESTINATION "${CMAKE_INSTALL_BINDIR}") endif() + +if(USE_PRIORITIZED_TEXT_FOR_LD) + add_compile_options( + $<$:-ffunction-sections> + $<$:-fdata-sections> + ) + set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld") + set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt") + + add_custom_command( + OUTPUT "${LINKER_SCRIPT_FILE_OUT}" + COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}" + DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}" + COMMENT "Generating prioritized text linker files" + VERBATIM + ) + + add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}") + + if(BUILD_PYTHON) + set(LINKER_OPT_TARGETS torch_python) + endif() + + if(NOT 
BUILD_LIBTORCHLESS) + list(APPEND LINKER_OPT_TARGETS torch_cpu c10) + if(USE_CUDA) + list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda) + endif() + if(USE_XPU) + list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu) + endif() + if(USE_ROCM) + list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip) + endif() + endif() + + foreach(tgt IN LISTS LINKER_OPT_TARGETS) + if(TARGET ${tgt}) + add_dependencies("${tgt}" generate_linker_script) + target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}") + set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}") + else() + message(WARNING "Requested target '${tgt}' for linker script optimization was not found.") + endif() + endforeach() + +else() + if(LINUX AND CPU_AARCH64) + message(WARNING [[ + It is strongly recommended to enable linker script optimization for all AArch64 Linux builds. + To do so, please export USE_PRIORITIZED_TEXT_FOR_LD=1 + ]]) + endif() +endif() diff --git a/README.md b/README.md index 99e6dabd16181..4356491e178e7 100644 --- a/README.md +++ b/README.md @@ -161,7 +161,7 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv) #### Prerequisites If you are installing from source, you will need: -- Python 3.9 or later +- Python 3.10 or later - A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux) - Visual Studio or Visual Studio Build Tool (Windows only) diff --git a/RELEASE.md b/RELEASE.md index 047bb10161f71..52371e73f0a6d 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -50,6 +50,7 @@ Following is the Release Compatibility Matrix for PyTorch releases: | PyTorch version | Python | C++ | Stable CUDA | Experimental CUDA | Stable ROCm | | --- | --- | --- | --- | --- | --- | +| 2.9 | >=3.10, <=3.14, (3.14t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 13.0 (CUDNN 9.13.0.50) | ROCm 6.4 | | 2.8 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 12.9 (CUDNN 9.10.2.21) | ROCm 6.4 | | 2.7 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8 (CUDNN 9.1.0.70), CUDA 12.6 (CUDNN 9.5.1.17) | CUDA 12.8 (CUDNN 9.7.1.26) | ROCm 6.3 | | 2.6 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8, CUDA 12.4 (CUDNN 9.1.0.70) | CUDA 12.6 (CUDNN 9.5.1.17) | ROCm 6.2.4 | diff --git a/SECURITY.md b/SECURITY.md index 3baa145df7953..16d72ef1ea08e 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -16,6 +16,8 @@ However, if you believe you have found a security vulnerability in PyTorch, we e Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new +All reports submitted through the security advisories mechanism will **either be made public or dismissed by the team within 90 days of the submission**. If an advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create a [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework.
+ Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported: https://www.facebook.com/whitehat diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index a3c98f37a0242..b9f8995082ccf 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -270,6 +270,14 @@ IF(USE_FBGEMM_GENAI) "${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu") list(FILTER fbgemm_genai_native_cuda_cu INCLUDE REGEX ${FBGEMM_CUTLASS_KERNELS_REGEX}) + # PyTorch is not built for 10.0a in CI, due to lack of portability, + # so we need to explicitly build these files for 10.0a. + foreach(cu_file ${fbgemm_genai_native_cuda_cu}) + _BUILD_FOR_ADDITIONAL_ARCHS( + "${cu_file}" + "100a") + endforeach() + file(GLOB_RECURSE fbgemm_genai_native_cuda_cpp "${FBGEMM_GENAI_SRCS}/common/*.cpp" ) @@ -315,10 +323,20 @@ IF(USE_FBGEMM_GENAI) -greedy-reverse-local-assignment=1 -fhip-new-launch-api) + # Only compile for gfx942 for now. + # This is rather hacky, I could not figure out a clean solution :( + set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS}) + string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}") + if("gfx942" IN_LIST PYTORCH_ROCM_ARCH) + list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;) + endif() + set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS}) + hip_add_library( fbgemm_genai STATIC ${fbgemm_genai_native_rocm_hip} HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) + set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL}) set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 4d48084b0ab89..7a8d02be530e3 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -180,7 +180,7 @@ void Context::setUserEnabledNNPACK(bool e) { } bool Context::allowTF32CuDNN(const std::string& op) const { - if (op.size() == 0){ + if (op.empty()){ bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32"; bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32"; TORCH_CHECK( @@ -281,9 +281,6 @@ bool Context::userEnabledOverrideableSDP() const { static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG"; static constexpr const std::array cublas_deterministic_configs = {":4096:8", ":16:8"}; -#ifdef USE_ROCM -static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32"; -#endif bool Context::checkCuBLASConfigDeterministic() { // If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config @@ -343,12 +340,6 @@ void Context::setImmediateMiopen(bool b) { } bool Context::allowTF32CuBLAS() const { -#ifdef USE_ROCM - const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32); - if (allow_tf32 != true) { - return false; - } -#endif bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST; bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32"; TORCH_CHECK( @@ -362,14 +353,6 @@ bool Context::allowTF32CuBLAS() const { } void Context::setAllowTF32CuBLAS(bool b) { -#ifdef USE_ROCM - const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32); - if (allow_tf32 != true) { - C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. 
" - << "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it."; - return; - } -#endif float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST; setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee"); } @@ -443,7 +426,7 @@ void Context::setFloat32Precision(const std::string& backend, const std::string& std::string msg; auto iterp = _fp32_precisions.find(backend); TORCH_CHECK(iterp != _fp32_precisions.end()); - for (auto p : iterp->second) { + for (const auto& p : iterp->second) { msg += p; msg += " "; } diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index 98ad757946bec..7c2ad5c609e7b 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -65,14 +65,24 @@ DLDataType getDLDataType(const Tensor& t) { break; // TODO(#146647): use macro here instead of spelling out each shell dtype case ScalarType::Float8_e5m2: + dtype.code = DLDataTypeCode::kDLFloat8_e5m2; + break; case ScalarType::Float8_e5m2fnuz: + dtype.code = DLDataTypeCode::kDLFloat8_e5m2fnuz; + break; case ScalarType::Float8_e4m3fn: + dtype.code = DLDataTypeCode::kDLFloat8_e4m3fn; + break; case ScalarType::Float8_e4m3fnuz: + dtype.code = DLDataTypeCode::kDLFloat8_e4m3fnuz; + break; case ScalarType::Float8_e8m0fnu: - TORCH_CHECK_BUFFER(false, "float8 types are not supported by dlpack"); + dtype.code = DLDataTypeCode::kDLFloat8_e8m0fnu; break; case ScalarType::Float4_e2m1fn_x2: - TORCH_CHECK_BUFFER(false, "float4 types are not supported by dlpack"); + dtype.code = DLDataTypeCode::kDLFloat4_e2m1fn; + dtype.lanes = 2; + dtype.bits = 4; break; case ScalarType::QInt8: case ScalarType::QUInt8: @@ -177,7 +187,11 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat ScalarType toScalarType(const DLDataType& dtype) { ScalarType stype = ScalarType::Undefined; - TORCH_CHECK_BUFFER(dtype.lanes == 1, "ATen does not support lanes != 1"); + if (dtype.code != DLDataTypeCode::kDLFloat4_e2m1fn) { + TORCH_CHECK_BUFFER( + dtype.lanes == 1, + "ATen does not support lanes != 1 for dtype code", std::to_string(dtype.code)); + } switch (dtype.code) { case DLDataTypeCode::kDLUInt: switch (dtype.bits) { @@ -269,6 +283,73 @@ ScalarType toScalarType(const DLDataType& dtype) { false, "Unsupported kDLBool bits ", std::to_string(dtype.bits)); } break; + case DLDataTypeCode::kDLFloat8_e5m2: + switch (dtype.bits) { + case 8: + stype = ScalarType::Float8_e5m2; + break; + default: + TORCH_CHECK_BUFFER( + false, "Unsupported kDLFloat8_e5m2 bits ", std::to_string(dtype.bits)); + } + break; + case DLDataTypeCode::kDLFloat8_e5m2fnuz: + switch (dtype.bits) { + case 8: + stype = ScalarType::Float8_e5m2fnuz; + break; + default: + TORCH_CHECK_BUFFER( + false, "Unsupported kDLFloat8_e5m2fnuz bits ", std::to_string(dtype.bits)); + } + break; + case DLDataTypeCode::kDLFloat8_e4m3fn: + switch (dtype.bits) { + case 8: + stype = ScalarType::Float8_e4m3fn; + break; + default: + TORCH_CHECK_BUFFER( + false, "Unsupported kDLFloat8_e4m3fn bits ", std::to_string(dtype.bits)); + } + break; + case DLDataTypeCode::kDLFloat8_e4m3fnuz: + switch (dtype.bits) { + case 8: + stype = ScalarType::Float8_e4m3fnuz; + break; + default: + TORCH_CHECK_BUFFER( + false, "Unsupported kDLFloat8_e4m3fnuz bits ", std::to_string(dtype.bits)); + } + break; + case DLDataTypeCode::kDLFloat8_e8m0fnu: + switch (dtype.bits) { + case 8: + stype = ScalarType::Float8_e8m0fnu; + break; + default: + TORCH_CHECK_BUFFER( + false, "Unsupported kDLFloat8_e8m0fnu bits ", 
std::to_string(dtype.bits)); + } + break; + case DLDataTypeCode::kDLFloat4_e2m1fn: + switch (dtype.bits) { + case 4: + switch (dtype.lanes) { + case 2: + stype = ScalarType::Float4_e2m1fn_x2; + break; + default: + TORCH_CHECK_BUFFER( + false, "Unsupported kDLFloat4_e2m1fn lanes ", std::to_string(dtype.lanes)); + } + break; + default: + TORCH_CHECK_BUFFER( + false, "Unsupported kDLFloat4_e2m1fn bits ", std::to_string(dtype.bits)); + } + break; default: TORCH_CHECK_BUFFER(false, "Unsupported code ", std::to_string(dtype.code)); } @@ -320,30 +401,13 @@ T* toDLPackImpl(const Tensor& src) { // The following code detects whether the src follows // a continuous pattern. If the src follows such pattern (common-case) // then we do not need to normalize the strides. - bool need_normalize_strides = false; - int64_t expected_stride = 1; - for (int i = src.dim() - 1; i >= 0; i--) { - // detect if we do not meet continuous pattern - // and the size is 1, so there is opportunity to normalize - if (src.stride(i) != expected_stride && src.size(i) == 1) { - need_normalize_strides = true; - break; - } - expected_stride *= src.size(i); - } - + bool need_normalize_strides = src.dim() == 1 && src.size(0) == 1 && src.stride(0) != 1; // less common case, try normalizing the strides if (need_normalize_strides) { // create a new tensor with possibly normalized strides // gh-83069 auto shape = src.sizes(); - auto strides = src.strides().vec(); - for (int i = 0; i < src.dim(); i++) { - if (shape[i] < 2) { - strides[i] = 1; - } - } - view = src.as_strided(shape, strides, src.storage_offset()); + view = src.as_strided(shape, {1}, src.storage_offset()); } ATenDLMTensor* atDLMTensor(new ATenDLMTensor); @@ -354,8 +418,8 @@ T* toDLPackImpl(const Tensor& src) { atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device()); atDLMTensor->tensor.dl_tensor.ndim = static_cast(src.dim()); atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src); - atDLMTensor->tensor.dl_tensor.shape = view.sizes().data(); - atDLMTensor->tensor.dl_tensor.strides = view.strides().data(); + atDLMTensor->tensor.dl_tensor.shape = const_cast(view.sizes().data()); + atDLMTensor->tensor.dl_tensor.strides = const_cast(view.strides().data()); atDLMTensor->tensor.dl_tensor.byte_offset = 0; fillVersion(&atDLMTensor->tensor); diff --git a/aten/src/ATen/FunctionalStorageImpl.cpp b/aten/src/ATen/FunctionalStorageImpl.cpp index a5512818343fb..2cf8d9727f658 100644 --- a/aten/src/ATen/FunctionalStorageImpl.cpp +++ b/aten/src/ATen/FunctionalStorageImpl.cpp @@ -102,7 +102,7 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base) // SparseTensorImpl has no storage, so we cannot query its nbytes. 
// (original_storage_size is only used for storage resizing in fsdp anyway, which does not apply to sparse) // Same for XLA - if (base.unsafeGetTensorImpl()->has_storage() && base.device().type() != c10::DeviceType::XLA) { + if (base.unsafeGetTensorImpl()->has_storage() && data_ptr().device().type() != c10::DeviceType::XLA) { original_storage_size_ = base.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes(); } else { original_storage_size_ = -1; diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index 7d5e4e84e861d..0a2fa153a6cf1 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -133,7 +133,7 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const : c10::TensorImpl( c10::DispatchKeySet(DispatchKey::Functionalize), view_value.dtype(), - view_value.device() + base->storage().data_ptr().device() ), value_(view_value), is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output), @@ -485,7 +485,10 @@ void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptrdevice(); + // The storage pointer already uses the underlying tensor custom device (if + // applicable) to extract the device. So, we dont have to recurse again by + // doing value_.unsafeGetTensorImpl()->device(). + return storage().data_ptr().device(); } at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const { return value_.unsafeGetTensorImpl()->sizes(); diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 39f77664de864..b10795fbc37eb 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -133,12 +133,12 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { "resize_ called on tensor with symbolic shape") TORCH_CHECK( sparse_dim + dense_dim == static_cast(size.size()), - "number of dimensions must be sparse_dim (", + "'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ", + size.size(), + ", sparse_dim = ", sparse_dim, - ") + dense_dim (", - dense_dim, - "), but got ", - size.size()); + ", dense_dim = ", + dense_dim); if (nnz() > 0) { [[maybe_unused]] auto constexpr alt_options_msg = "You could try the following options:\n\ @@ -254,12 +254,12 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { "resize_and_clear_ called on tensor with symbolic shape") TORCH_CHECK( sparse_dim + dense_dim == static_cast(size.size()), - "number of dimensions must be sparse_dim (", + "'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ", + size.size(), + ", sparse_dim = ", sparse_dim, - ") + dense_dim (", - dense_dim, - "), but got ", - size.size()); + ", dense_dim = ", + dense_dim); set_sizes_and_strides(size, std::vector(size.size())); sparse_dim_ = sparse_dim; diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 0d319ea593840..07d5ae5d9886e 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -644,6 +644,8 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP void * beta_ptr = &fbeta; #ifdef USE_ROCM int flag = 0; + rocblas_datatype c_type = std::is_same::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r; + rocblas_datatype d_type = c_type; #if USE_GEMM_FLAGS_FP16_ALT_IMPL flag = at::ROCmBackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; #endif @@ -652,8 +654,8 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP hipOperationToRocOperation(opb), (int)m, (int)n, (int)k, (void*)alpha_ptr, a, rocblas_datatype_f16_r, (int)lda, stridea, b, rocblas_datatype_f16_r, (int)ldb, strideb, - (void*)beta_ptr, c, rocblas_datatype_f16_r, (int)ldc, stridec, - c, rocblas_datatype_f16_r, (int)ldc, stridec, + (void*)beta_ptr, c, c_type, (int)ldc, stridec, + c, d_type, (int)ldc, stridec, (int) num_batches, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, 0, flag))); #else @@ -1096,6 +1098,8 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( GEMM_CHECK_ARGVALUES(at::Half); #ifdef USE_ROCM int flag = 0; + rocblas_datatype c_type = std::is_same::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r; + rocblas_datatype d_type = c_type; #if USE_GEMM_FLAGS_FP16_ALT_IMPL flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; #endif @@ -1115,10 +1119,10 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( ldb, beta_ptr, c, - rocblas_datatype_f16_r, + c_type, ldc, c, - rocblas_datatype_f16_r, + d_type, ldc, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, @@ -1633,9 +1637,7 @@ bool gemm_and_bias( if (activation == GEMMAndBiasActivationEpilogue::RELU) { epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; } else if (activation == GEMMAndBiasActivationEpilogue::GELU) { -#if CUDA_VERSION >= 11040 || defined(USE_ROCM) epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; -#endif } if (bias != nullptr) { @@ -1927,7 +1929,6 @@ void scaled_gemm( bool use_fast_accum) { // Note: see `cublasCommonArgs` for various non-intuitive manupulations // of input arguments to this function. -#if CUDA_VERSION >= 11080 || defined(USE_ROCM) const auto computeType = CUBLAS_COMPUTE_32F; const auto scaleType = CUDA_R_32F; const float alpha_val = 1.0; @@ -1950,8 +1951,8 @@ void scaled_gemm( #if ROCM_VERSION >= 70000 if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) { // TODO: add constraints based on hipblaslt internals - TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0), - "Matrix dimensions must be multiples of 32 for MX format. " + TORCH_CHECK((m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0), + "M, N must be multiples of 16 and K should be multiple of 128 for MX format. 
" "Got m=", m, ", n=", n, ", k=", k); } #endif @@ -2129,8 +2130,6 @@ void scaled_gemm( " scaleType ", scaleType); return; -#endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM) - TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above"); } void int8_gemm( diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp index 422890084c900..f95faa94e6113 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp @@ -266,11 +266,14 @@ CUDAGeneratorImpl::CUDAGeneratorImpl( * See Note [Acquire lock when using random generators] */ void CUDAGeneratorImpl::set_current_seed(uint64_t seed) { - at::cuda::assertNotCapturing( - "Cannot call CUDAGeneratorImpl::set_current_seed"); - state_->seed_ = seed; - state_->philox_offset_per_thread_ = 0; - no_reset_rnn_state_.clear(); + if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) { + state_->seed_ = seed; + state_->philox_offset_per_thread_ = 0; + no_reset_rnn_state_.clear(); + } else { + TORCH_CHECK(state_->seed_ == seed, "CUDAGeneratorImpl::set_current_seed can be called during stream capture only if new seed is the same as the original seed."); + // no-op case + } } /** @@ -299,9 +302,6 @@ uint64_t CUDAGeneratorImpl::get_offset() const { * Gets the current seed of CUDAGeneratorImpl. */ uint64_t CUDAGeneratorImpl::current_seed() const { - // Debatable if current_seed() should be allowed in captured regions. - // Conservatively disallow it for now. - at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::current_seed"); return state_->seed_; } @@ -346,8 +346,6 @@ c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { * and size of the internal state. */ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { - at::cuda::assertNotCapturing( - "Please ensure to utilize the CUDAGeneratorImpl::set_state_index method during capturing."); static const size_t seed_size = sizeof(uint64_t); static const size_t offset_size = sizeof(int64_t); static const size_t total_size = seed_size + offset_size; @@ -402,15 +400,27 @@ c10::intrusive_ptr CUDAGeneratorImpl::graphsafe_get_state() */ void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) { // see Note [Why enforce RNG offset % 4 == 0?] + + // Note: If you use CUDNN RNN's, calling + // set_philox_offset_per_thread instead of set_offset will cause the + // cudnn RNN rng state to become stale. TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4"); - state_->philox_offset_per_thread_ = offset; + if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) { + state_->philox_offset_per_thread_ = offset; + } else { + state_->offset_intragraph_ = offset; + } } /** * Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl. */ uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const { - return state_->philox_offset_per_thread_; + if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) { + return state_->philox_offset_per_thread_; + } else { + return state_->offset_intragraph_; + } } /** diff --git a/aten/src/ATen/dlpack.h b/aten/src/ATen/dlpack.h index 82c0668211188..f1b3ae2b7760b 100644 --- a/aten/src/ATen/dlpack.h +++ b/aten/src/ATen/dlpack.h @@ -19,7 +19,7 @@ #define DLPACK_MAJOR_VERSION 1 /*! \brief The current minor version of dlpack */ -#define DLPACK_MINOR_VERSION 0 +#define DLPACK_MINOR_VERSION 1 /*! 
\brief DLPACK_DLL prefix for windows */ #ifdef _WIN32 @@ -32,9 +32,7 @@ #define DLPACK_DLL #endif -// NOLINTNEXTLINE(modernize-deprecated-headers) #include -// NOLINTNEXTLINE(modernize-deprecated-headers) #include #ifdef __cplusplus @@ -159,6 +157,26 @@ typedef enum { kDLComplex = 5U, /*! \brief boolean */ kDLBool = 6U, + /*! \brief FP8 data types */ + kDLFloat8_e3m4 = 7U, + kDLFloat8_e4m3 = 8U, + kDLFloat8_e4m3b11fnuz = 9U, + kDLFloat8_e4m3fn = 10U, + kDLFloat8_e4m3fnuz = 11U, + kDLFloat8_e5m2 = 12U, + kDLFloat8_e5m2fnuz = 13U, + kDLFloat8_e8m0fnu = 14U, + /*! \brief FP6 data types + * Setting bits != 6 is currently unspecified, and the producer must ensure it is set + * while the consumer must stop importing if the value is unexpected. + */ + kDLFloat6_e2m3fn = 15U, + kDLFloat6_e3m2fn = 16U, + /*! \brief FP4 data types + * Setting bits != 4 is currently unspecified, and the producer must ensure it is set + * while the consumer must stop importing if the value is unexpected. + */ + kDLFloat4_e2m1fn = 17U, } DLDataTypeCode; /*! @@ -172,6 +190,12 @@ typedef enum { * - int8: type_code = 0, bits = 8, lanes = 1 * - std::complex: type_code = 5, bits = 64, lanes = 1 * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits) + * - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory) + * - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory) + * - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory) + * + * When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e., + * for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element. */ typedef struct { /*! @@ -229,12 +253,12 @@ typedef struct { /*! \brief The data type of the pointer*/ DLDataType dtype; /*! \brief The shape of the tensor */ - const int64_t* shape; + int64_t* shape; /*! * \brief strides of the tensor (in number of elements, not bytes) * can be NULL, indicating tensor is compact and row-majored. */ - const int64_t* strides; + int64_t* strides; /*! \brief The offset in bytes to the beginning pointer to data */ uint64_t byte_offset; } DLTensor; @@ -269,7 +293,7 @@ typedef struct DLManagedTensor { void (*deleter)(struct DLManagedTensor * self); } DLManagedTensor; -// bit masks used in in the DLManagedTensorVersioned +// bit masks used in the DLManagedTensorVersioned /*! \brief bit mask to indicate that the tensor is read only. */ #define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL) @@ -282,6 +306,14 @@ typedef struct DLManagedTensor { */ #define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL) +/* + * \brief bit mask to indicate that whether a sub-byte type is packed or padded. + * + * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can + * be set by the producer to signal that a tensor of sub-byte type is padded. + */ +#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL) + /*! * \brief A versioned and managed C Tensor object, manage memory of DLTensor. 
* diff --git a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp index b26d2c4a419e5..48a735c3e5332 100644 --- a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp @@ -171,6 +171,8 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { POINTWISE_BOXED(fill_.Scalar); POINTWISE_BOXED(zero_); + // This is special because this op doesn't return anything + m.impl("_assert_tensor_metadata", native::_assert_tensor_metadata); #undef UNARY_POINTWISE #undef UNARY_POINTWISE_ALL diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp index e06afddd05aa7..20be0d6fe017a 100644 --- a/aten/src/ATen/native/CPUBlas.cpp +++ b/aten/src/ATen/native/CPUBlas.cpp @@ -457,24 +457,9 @@ void gemm( return; } #endif - // for the fallback path, first compute gemm with beta = 0, - // and then add c in full precision. - int64_t c_size = n * m; - std::vector float_c(c_size, 0.f); gemm_no_downcast_stub( at::kCPU, at::kBFloat16, - transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m); - for (const auto j : c10::irange(n)) { - for (const auto i : c10::irange(m)) { - auto offset = j * ldc + i; - // beta == 0 won't propagate NaN from C - if (beta == 0.f) { - c[offset] = float_c[j * m + i]; - } else { - c[offset] = beta * c[offset] + float_c[j * m + i]; - } - } - } + transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm( @@ -493,24 +478,9 @@ void gemm( return; } #endif - // for the fallback path, first compute gemm with beta = 0, - // and then add c in full precision. - int64_t c_size = n * m; - std::vector float_c(c_size, 0.f); gemm_no_downcast_stub( at::kCPU, at::kHalf, - transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m); - for (const auto j : c10::irange(n)) { - for (const auto i : c10::irange(m)) { - auto offset = j * ldc + i; - // beta == 0 won't propagate NaN from C - if (beta == 0.f) { - c[offset] = float_c[j * m + i]; - } else { - c[offset] = beta * c[offset] + float_c[j * m + i]; - } - } - } + transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm( diff --git a/aten/src/ATen/native/ChanelShuffle.cpp b/aten/src/ATen/native/ChanelShuffle.cpp index 64fdd56c0e665..d043014b3820e 100644 --- a/aten/src/ATen/native/ChanelShuffle.cpp +++ b/aten/src/ATen/native/ChanelShuffle.cpp @@ -81,7 +81,7 @@ Tensor math_channel_shuffle(const Tensor& self, int64_t groups) { // TODO: contiguous can be made to preserve the memory format // of the input. However since the above reshape clobbers h and w // it may not be safe to do that, since channels_last contiguous - // may think oc and and the last dim correspond to h,w? + // may think oc and the last dim correspond to h,w? // It is not clear, however from initial looking around it feels that // this may not be correct. 
// In this case channels last will likely require custom implementation diff --git a/aten/src/ATen/native/EmbeddingBag.h b/aten/src/ATen/native/EmbeddingBag.h index eb29e1171dcd6..a344422204844 100644 --- a/aten/src/ATen/native/EmbeddingBag.h +++ b/aten/src/ATen/native/EmbeddingBag.h @@ -1,3 +1,4 @@ +#pragma once #include #include #include diff --git a/aten/src/ATen/native/Fill.cpp b/aten/src/ATen/native/Fill.cpp index 5ff1e6b61ed20..8e04a7490e879 100644 --- a/aten/src/ATen/native/Fill.cpp +++ b/aten/src/ATen/native/Fill.cpp @@ -97,43 +97,38 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) { int64_t nDims = self.dim(); TORCH_CHECK(nDims >= 2, "dimensions must larger than 1"); - int64_t height = self.size(0); - int64_t width = self.size(1); + auto height = self.sym_size(0); + auto width = self.sym_size(1); if (nDims > 2) { - int64_t dim1 = height; for (const auto i : c10::irange(1, nDims)) { - if (self.size(i) != dim1) { + if (self.sym_size(i) != height) { TORCH_CHECK(false, "all dimensions of input must be of equal length"); } } } - int64_t storage_offset = self.storage_offset(); - std::vector sizes; - std::vector strides; - int64_t size = std::min(height, width); + auto storage_offset = self.sym_storage_offset(); + auto size = std::min(height, width); int64_t stride = 0; for (const auto i : c10::irange(nDims)) { stride += self.stride(i); } - strides.push_back(stride); - sizes.push_back(size); + std::vector strides{stride}; + std::vector sizes{size}; - auto main_diag = self.as_strided(sizes, strides, storage_offset); + auto main_diag = self.as_strided_symint(sizes, strides, storage_offset); main_diag.fill_(fill_value); if (wrap && nDims == 2 && height > width + 1) { - std::vector wrap_sizes; + auto step = width + 1; + auto wrap_size = ((self.numel() + step - 1) / step) - size; + std::vector wrap_sizes{wrap_size}; - int64_t step = width + 1; - int64_t wrap_size = ((self.numel() + step - 1) / step) - size; - wrap_sizes.push_back(wrap_size); + auto offset = self.stride(0) * (width + 1); - int64_t offset = self.stride(0) * (width + 1); - - auto wrap_diag = self.as_strided(wrap_sizes, strides, storage_offset + offset); + auto wrap_diag = self.as_strided_symint(wrap_sizes, strides, storage_offset + offset); wrap_diag.fill_(fill_value); } diff --git a/aten/src/ATen/native/FractionalMaxPool3d.cpp b/aten/src/ATen/native/FractionalMaxPool3d.cpp index d1fa7092f5f15..68328018b24b4 100644 --- a/aten/src/ATen/native/FractionalMaxPool3d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool3d.cpp @@ -67,13 +67,13 @@ TORCH_PRECOMPUTE_META_FUNC(fractional_max_pool3d)( int64_t inputH = input_.size(heightDim); int64_t inputW = input_.size(widthDim); - TORCH_CHECK(outputT + poolSizeT - 1 < inputT, + TORCH_CHECK((poolSizeT <= inputT) && (outputT + poolSizeT - 1 < inputT), "fractional_max_pool3d_out(): pool time ", poolSizeT, " too large relative to input time ", inputT); - TORCH_CHECK(outputW + poolSizeW - 1 < inputW, + TORCH_CHECK((poolSizeW <= inputW) && (outputW + poolSizeW - 1 < inputW), "fractional_max_pool3d_out(): pool width ", poolSizeW, " too large relative to input width ", inputW); - TORCH_CHECK(outputH + poolSizeH - 1 < inputH, + TORCH_CHECK((poolSizeH <= inputH) && (outputH + poolSizeH - 1 < inputH), "fractional_max_pool3d_out(): pool height ", poolSizeH, " too large relative to input height ", inputH); diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index b62c584641dba..616e6ec60e13d 100644 --- 
a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1360,7 +1360,8 @@ Tensor outer(const Tensor& self, const Tensor& vec2) { #endif -#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED() +#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED() +// Used by default on x86 platforms and on AArch64+ACL static inline int64_t get_mkldnn_matmul_min_dim() { static auto value = [&] { const int64_t default_min_dim = [&] { @@ -1395,8 +1396,6 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) { return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size; } #endif - - static void addmm_impl_cpu_( Tensor &result, const Tensor &self, Tensor m1, Tensor m2, const Scalar& beta, const Scalar& alpha) { TORCH_INTERNAL_ASSERT(self.dim() == 2 && m1.dim() == 2 && m2.dim() == 2); @@ -1772,8 +1771,8 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens return (strides[2] == 1 && (sizes[1] == 1 || strides[1] >= sizes[2])) || (strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1])); }; - -#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED() +#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED() + // Always apply mkldnn heuristic on x86 platform, but on ARM only if compiled with ACL bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]); if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) { try { @@ -1785,7 +1784,6 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens } } #endif - if (contraction_size * res_rows * res_cols < 400) { if (is_bmm_out) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, batch1.scalar_type(), "bmm", [&] { diff --git a/aten/src/ATen/native/MaxUnpooling.cpp b/aten/src/ATen/native/MaxUnpooling.cpp index a71db5e8ef8d1..f91b892efec21 100644 --- a/aten/src/ATen/native/MaxUnpooling.cpp +++ b/aten/src/ATen/native/MaxUnpooling.cpp @@ -23,8 +23,6 @@ Tensor& max_unpooling2d_forward_out_cpu( // Nondeterministic with duplicate indices at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out"); - auto oheight = output_size[0]; - auto owidth = output_size[1]; TORCH_CHECK( indices_.scalar_type() == at::ScalarType::Long, "elements in indices should be type int64 but got: ", indices_.scalar_type()); @@ -45,6 +43,9 @@ Tensor& max_unpooling2d_forward_out_cpu( self_.sizes(), " with dimension ", i , " being empty."); } + auto oheight = output_size[0]; + auto owidth = output_size[1]; + auto memory_format = self_.suggest_memory_format(); auto self = self_.contiguous(memory_format); auto indices = indices_.contiguous(memory_format); diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index ac1086c6b6bd3..229d504b0a386 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -671,7 +671,9 @@ std::tuple _batch_norm_impl_index( std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index (calling miopen_batch_norm)" << std::endl; return std::tuple_cat( at::miopen_batch_norm( - input.contiguous(input.suggest_memory_format()), weight.contiguous(), bias.contiguous(), + input.contiguous(input.suggest_memory_format()), + weight.contiguous(), + bias.contiguous(), running_mean.defined() ? running_mean.contiguous() : running_mean, running_var.defined() ? 
running_var.contiguous() : running_var, training, momentum, eps), diff --git a/aten/src/ATen/native/PadNd.cpp b/aten/src/ATen/native/PadNd.cpp index 8099648d37b29..3c00a16108c12 100644 --- a/aten/src/ATen/native/PadNd.cpp +++ b/aten/src/ATen/native/PadNd.cpp @@ -73,7 +73,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value) for (const auto i : c10::irange((size_t)l_pad)) { auto pad_idx = pad.size() - ((i + 1) * 2); auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; - TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", + TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, " "which is invalid. Check dimension ", l_diff + i, " of your input."); new_shape.emplace_back(new_dim); diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 408faea1b7644..7d613fc023120 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -2174,7 +2174,7 @@ static void _scatter_via_index_put( if (self.dim() == 1 || broadcast_index) { Tensor squeezed = index; if (broadcast_index && index.dim() > 1) { - for (const auto d : c10::irange(index.dim())) { + for (int64_t d = index.dim() - 1; d >= 0; --d) { if (d == dim) { continue; } diff --git a/aten/src/ATen/native/TriangularOps.cpp b/aten/src/ATen/native/TriangularOps.cpp index 47264c45205c0..08b666e296ed7 100644 --- a/aten/src/ATen/native/TriangularOps.cpp +++ b/aten/src/ATen/native/TriangularOps.cpp @@ -52,6 +52,7 @@ void apply_triu_tril_single( int64_t self_col_stride, bool upper) { constexpr int64_t zero = 0; + k = std::clamp(k, -n, m); // Clamp k to [-n, m] to prevent i + k arithmetic overflow, especially if k approaches INT64_MAX/INT64_MIN. 
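The effect of the clamp above is easiest to see from the Python side. A minimal sketch, not taken from the patch, assuming a CPU build that includes this change; the tensor contents and the extreme `diagonal` offsets are arbitrary illustration values:

import torch

# With k clamped to [-n, m], the `i + k` index arithmetic in the CPU kernel
# cannot overflow even for extreme diagonal offsets; the result is just the
# mathematically expected matrix (here, all zeros in both cases).
x = torch.arange(9.0).reshape(3, 3)
print(torch.triu(x, diagonal=2**62))   # no element lies on/above that diagonal -> zeros
print(torch.tril(x, diagonal=-2**62))  # no element lies on/below that diagonal -> zeros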
if (upper) { parallel_for(0, n, 0, [&](int64_t start, int64_t end) { diff --git a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp index c775bc756145a..fca7d8bdce5ae 100644 --- a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp @@ -85,11 +85,11 @@ void cpu_max_unpool( if constexpr (is_3d) { TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(), " (output volumes are of size ", output_depth, - "x", output_height, "x", output_width); + "x", output_height, "x", output_width, ")"); } else { TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(), " (output volumes are of size ", output_height, - "x", output_width); + "x", output_width, ")"); } } diff --git a/aten/src/ATen/native/cuda/ActivationHardsigmoidKernel.cu b/aten/src/ATen/native/cuda/ActivationHardsigmoidKernel.cu index 8a3326fddb8a9..fcacef37ceaf0 100644 --- a/aten/src/ATen/native/cuda/ActivationHardsigmoidKernel.cu +++ b/aten/src/ATen/native/cuda/ActivationHardsigmoidKernel.cu @@ -36,7 +36,7 @@ void hardsigmoid_kernel(TensorIteratorBase& iter) { [zero, one_sixth, three, six] GPU_LAMBDA( scalar_t self_val) -> scalar_t { opmath_t x = static_cast(self_val); - return std::min(std::max(x + three, zero), six) * one_sixth; + return std::min(std::max(x + three, zero), six) * one_sixth; }); }); } diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index fcaae32e773f1..652dc8e121d0d 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -1081,16 +1081,6 @@ static bool _scaled_mm_allowed_device(bool sm90_only=false, bool sm100_only=fals #endif } -static bool _grouped_mm_allowed_device() { -#ifdef USE_ROCM - return false; -#else - auto dprops = at::cuda::getCurrentDeviceProperties(); - // CUDA capability 8.0 and greater - return dprops->major >= 8; -#endif -} - #ifdef USE_ROCM static bool _scaled_mm_is_fnuz() { return at::detail::getCUDAHooks().isGPUArch({"gfx942"}); @@ -1149,9 +1139,14 @@ bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) { bool is_blockwise_1x32_scaling(const at::Tensor& t, const at::Tensor& scale) { // TODO: We might want to enforce some structure on the shapes of the scale // tensors - return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu - && scale.numel() == round_up(t.size(0), 128) * round_up(ceil_div(t.size(1), 32), 4) - && scale.is_contiguous()); + bool is_fp8_path = (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu + && scale.numel() == round_up(t.size(0), 128) * round_up(ceil_div(t.size(1), 32), 4)); + bool is_packed_fp4_path = false; +#ifdef USE_ROCM + is_packed_fp4_path = (t.scalar_type() == ScalarType::Float4_e2m1fn_x2 && scale.scalar_type() == at::kFloat8_e8m0fnu + && scale.numel() == round_up(t.size(0), 128) * round_up(ceil_div(t.size(1) * 2, 32), 4)); +#endif + return (is_fp8_path || is_packed_fp4_path) && scale.is_contiguous(); } bool is_blockwise_1x128_scaling(const at::Tensor& t, const at::Tensor& scale) { @@ -1392,9 +1387,15 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}), "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950"); - TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 && - mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0, - "Matrix dimensions must be multiples of 32 for block-wise scaling"); + int packed_factor 
= 1; + if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2) { + // For float4 data type, each byte stores two 4-bit floating-point values, + // effectively packing two elements into one byte. + packed_factor = 2; + } + TORCH_CHECK(mat1.size(0) % 16 == 0 && (mat1.size(1) * packed_factor) % 128 == 0 && + mat2.size(1) % 16 == 0, + "M, N must be multiples of 16 and K must be multiple of 128 for block-wise scaling"); TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 || out.scalar_type() == ScalarType::Half, @@ -1787,14 +1788,19 @@ Tensor _grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, const std::optional& offs, const std::optional& bias, std::optional out_dtype) { -#ifndef USE_ROCM _grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype); bool a_b_and_out_are_bf16 = ( mat_a.dtype() == at::kBFloat16 && mat_b.dtype() == at::kBFloat16 && out_dtype.value_or(at::kBFloat16) == at::kBFloat16 ); +#ifndef USE_ROCM bool use_fast_path = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true) && a_b_and_out_are_bf16; +#else + // _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used. + // the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm + bool use_fast_path = false; +#endif const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); if (use_fast_path) { @@ -1804,9 +1810,6 @@ std::optional out_dtype) { _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); } return out; -#else - TORCH_CHECK(false, "grouped gemm is not supported on ROCM") -#endif } Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) { diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu index 7ee02b02b41f1..227d42247ebd9 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu @@ -51,7 +51,7 @@ std::vector foreach_tensor_list_op( Op(), alpha.to()); - return tensor_lists[2]; + return std::move(tensor_lists[2]); } template class Op> diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu index 80d748dd3579b..9ac0e875b2d68 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu @@ -45,7 +45,7 @@ std::vector foreach_binary_op( /* res_arg_index */ 1>(), Op(), scalar.to()); - return tensor_lists[1]; + return std::move(tensor_lists[1]); } template class Op> diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu index dcb93188b5e69..b28aa690630b4 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu @@ -33,7 +33,7 @@ std::vector foreach_binary_op( } tensor_lists.emplace_back(tensors.vec()); - tensor_lists.emplace_back(vec_res); + tensor_lists.emplace_back(std::move(vec_res)); using opmath_t = at::opmath_type; multi_tensor_apply<2, opmath_t>( @@ -46,7 +46,7 @@ std::vector foreach_binary_op( /* res_arg_index */ 1>(), Op()); - return tensor_lists[1]; + return std::move(tensor_lists[1]); } template class Op> diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu index ad5eeee5ebec4..bc6bd37891258 100644 --- 
a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu @@ -56,7 +56,7 @@ std::vector foreach_binary_op( Op(), scalar.data_ptr(), alpha.to()); - return tensor_lists[1]; + return std::move(tensor_lists[1]); } template class Op> diff --git a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu index 7a3276c44750a..7f563f55d5565 100644 --- a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu +++ b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu @@ -57,7 +57,7 @@ std::vector foreach_pointwise_op( scalar.to()); }); - return tensor_lists[3]; + return std::move(tensor_lists[3]); } template
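For context on the DLConvertor.cpp and dlpack.h hunks earlier in this diff, which map the float8 (and packed float4) dtypes onto the new DLDataTypeCode values instead of rejecting them: a minimal Python sketch of the intended round trip, assuming a build that includes those hunks; the shape and the float8_e5m2 choice are arbitrary.

import torch
from torch.utils import dlpack

# Export a float8 tensor through DLPack (previously this raised
# "float8 types are not supported by dlpack") and re-import it.
x = torch.randn(4, 4).to(torch.float8_e5m2)
capsule = dlpack.to_dlpack(x)       # carries kDLFloat8_e5m2, bits=8, lanes=1
y = torch.from_dlpack(capsule)      # zero-copy view over the same storage
assert y.dtype == torch.float8_e5m2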