diff --git a/README.md b/README.md
index 22e879dc..4ce0f750 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,14 @@ Summary: PyTorch is an optimized tensor library for deep learning using GPUs and
 Development: https://github.com/pytorch/pytorch
+Documentation: https://pytorch.org/docs/
+
+PyTorch is a Python package that provides two high-level features:
+  - Tensor computation (like NumPy) with strong GPU acceleration
+  - Deep neural networks built on a tape-based autograd system
+You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed.
+
+
 Current build status
 ====================
diff --git a/recipe/bld.bat b/recipe/bld.bat
index e4d0bae5..5f6f57c6 100644
--- a/recipe/bld.bat
+++ b/recipe/bld.bat
@@ -7,6 +7,12 @@ if EXIST pyproject.toml (
     if %ERRORLEVEL% neq 0 exit 1
 )
+@REM The PyTorch test suite includes some symlinks, which aren't resolved on Windows, leading to packaging errors.
+@REM ATTN! The symlinks change and have to be updated manually, often with each release.
+@REM (No symlinks are currently being packaged. This information is kept here because the issue took some months to find. Look out
+@REM for a failure with the error message: "conda_package_handling.exceptions.ArchiveCreationError: Cannot stat
+@REM while writing file")
+
 set PYTORCH_BUILD_VERSION=%PKG_VERSION%
 @REM Always pass 0 to avoid appending ".post" to version string.
 @REM https://github.com/conda-forge/pytorch-cpu-feedstock/issues/315
@@ -97,6 +103,10 @@ if not "%cuda_compiler_version%" == "None" (
 set DISTUTILS_USE_SDK=1
+@REM Use our pybind11 and Eigen
+set USE_SYSTEM_PYBIND11=1
+set USE_SYSTEM_EIGEN_INSTALL=1
+
 set CMAKE_INCLUDE_PATH=%LIBRARY_PREFIX%\include
 set LIB=%LIBRARY_PREFIX%\lib;%LIB%
@@ -128,7 +138,7 @@ set "USE_LITE_PROTO=ON"
 set "USE_OPENMP=OFF"
 @REM The activation script for cuda-nvcc doesnt add the CUDA_CFLAGS on windows.
-@REM Therefor we do this manually here. See:
+@REM Therefore we do this manually here. 
See: @REM https://github.com/conda-forge/cuda-nvcc-feedstock/issues/47 echo "CUDA_CFLAGS=%CUDA_CFLAGS%" set "CUDA_CFLAGS=-I%PREFIX%/Library/include -I%BUILD_PREFIX%/Library/include" @@ -183,19 +193,12 @@ if "%PKG_NAME%" == "libtorch" ( pushd torch-%PKG_VERSION% if %ERRORLEVEL% neq 0 exit 1 - @REM Do not package `fmt.lib` (and its metadata); delete it before the move into - @REM %LIBRARY_BIN% because it may exist in host before installation already - del torch\lib\fmt.lib torch\lib\pkgconfig\fmt.pc - if %ERRORLEVEL% neq 0 exit 1 - @REM also delete rest of fmt metadata - rmdir /s /q torch\lib\cmake\fmt - @REM Move the binaries into the packages site-package directory @REM the only content of torch\bin, {asmjit,fbgemm}.dll, also exists in torch\lib - robocopy /NP /NFL /NDL /NJH /E torch\lib\ %LIBRARY_BIN%\ torch*.dll c10.dll shm.dll asmjit.dll fbgemm.dll + robocopy /NP /NFL /NDL /NJH /E torch\bin\ %LIBRARY_BIN%\ torch*.dll c10.dll shm.dll asmjit.dll fbgemm.dll robocopy /NP /NFL /NDL /NJH /E torch\lib\ %LIBRARY_LIB%\ torch*.lib c10.lib shm.lib asmjit.lib fbgemm.lib if not "%cuda_compiler_version%" == "None" ( - robocopy /NP /NFL /NDL /NJH /E torch\lib\ %LIBRARY_BIN%\ c10_cuda.dll caffe2_nvrtc.dll + robocopy /NP /NFL /NDL /NJH /E torch\bin\ %LIBRARY_BIN%\ c10_cuda.dll caffe2_nvrtc.dll robocopy /NP /NFL /NDL /NJH /E torch\lib\ %LIBRARY_LIB%\ c10_cuda.lib caffe2_nvrtc.lib ) robocopy /NP /NFL /NDL /NJH /E torch\share\ %LIBRARY_PREFIX%\share @@ -216,7 +219,7 @@ if "%PKG_NAME%" == "libtorch" ( if %ERRORLEVEL% neq 0 exit 1 ) else if "%PKG_NAME%" == "pytorch" ( @REM Move libtorch_python and remove the other directories afterwards. - robocopy /NP /NFL /NDL /NJH /E %SP_DIR%\torch\lib\ %LIBRARY_BIN%\ torch_python.dll + robocopy /NP /NFL /NDL /NJH /E %SP_DIR%\torch\bin\ %LIBRARY_BIN%\ torch_python.dll robocopy /NP /NFL /NDL /NJH /E %SP_DIR%\torch\lib\ %LIBRARY_LIB%\ torch_python.lib robocopy /NP /NFL /NDL /NJH /E %SP_DIR%\torch\lib\ %LIBRARY_LIB%\ _C.lib rmdir /s /q %SP_DIR%\torch\lib diff --git a/recipe/build.sh b/recipe/build.sh index 57044b09..648763a1 100644 --- a/recipe/build.sh +++ b/recipe/build.sh @@ -1,9 +1,11 @@ #!/bin/bash -echo "=== Building ${PKG_NAME} (py: ${PY_VER}) ===" - set -ex +echo "#########################################################################" +echo "Building ${PKG_NAME} (py: ${PY_VER}) using BLAS implementation $blas_impl" +echo "#########################################################################" + # This is used to detect if it's in the process of building pytorch export IN_PYTORCH_BUILD=1 @@ -20,9 +22,22 @@ rm -rf pyproject.toml export USE_CUFILE=0 export USE_NUMA=0 export USE_ITT=0 + +#################### ADJUST COMPILER AND LINKER FLAGS ##################### +# Pytorch's build system doesn't like us setting the c++ standard through CMAKE_CXX_FLAGS +# and will issue a warning. We need to use at least C++17 to match the abseil ABI, see +# https://github.com/conda-forge/abseil-cpp-feedstock/issues/45, which pytorch 2.5 uses already: +# https://github.com/pytorch/pytorch/blob/v2.5.1/CMakeLists.txt#L36-L48 +export CXXFLAGS="$(echo $CXXFLAGS | sed 's/-std=c++[0-9][0-9]//g')" +# The below three lines expose symbols that would otherwise be hidden or +# optimised away. 
They were here before, so removing them would potentially
+# break users' programs.
 export CFLAGS="$(echo $CFLAGS | sed 's/-fvisibility-inlines-hidden//g')"
 export CXXFLAGS="$(echo $CXXFLAGS | sed 's/-fvisibility-inlines-hidden//g')"
 export LDFLAGS="$(echo $LDFLAGS | sed 's/-Wl,--as-needed//g')"
+# The default conda LDFLAGS include -Wl,-dead_strip_dylibs, which removes all the
+# MKL sequential, core, etc. libraries, resulting in a "Symbol not found: _mkl_blas_caxpy"
+# error on osx-64.
 export LDFLAGS="$(echo $LDFLAGS | sed 's/-Wl,-dead_strip_dylibs//g')"
 export LDFLAGS_LD="$(echo $LDFLAGS_LD | sed 's/-dead_strip_dylibs//g')"
 if [[ "$c_compiler" == "clang" ]]; then
@@ -45,6 +60,7 @@ fi
 # can be imported on system without a GPU
 LDFLAGS="${LDFLAGS//-Wl,-z,now/-Wl,-z,lazy}"
+################ CONFIGURE CMAKE FOR CONDA ENVIRONMENT ###################
 export CMAKE_GENERATOR=Ninja
 export CMAKE_LIBRARY_PATH=$PREFIX/lib:$PREFIX/include:$CMAKE_LIBRARY_PATH
 export CMAKE_PREFIX_PATH=$PREFIX
@@ -73,6 +89,8 @@ export USE_SYSTEM_SLEEF=1
 # use our protobuf
 export BUILD_CUSTOM_PROTOBUF=OFF
 rm -rf $PREFIX/bin/protoc
+export USE_SYSTEM_PYBIND11=1
+export USE_SYSTEM_EIGEN_INSTALL=1
 # prevent six from being downloaded
 > third_party/NNPACK/cmake/DownloadSix.cmake
@@ -98,18 +116,29 @@ if [[ "${CI}" == "github_actions" ]]; then
     # reduce parallelism to avoid getting OOM-killed on
     # cirun-openstack-gpu-2xlarge, which has 32GB RAM, 8 CPUs
     export MAX_JOBS=4
-else
+elif [[ "${CI}" == "azure" ]]; then
     export MAX_JOBS=${CPU_COUNT}
-fi
-
-if [[ "$blas_impl" == "generic" ]]; then
-    # Fake openblas
-    export BLAS=OpenBLAS
-    export OpenBLAS_HOME=${PREFIX}
 else
-    export BLAS=MKL
+    # Leave a spare core for other tasks, per common practice.
+    # Reducing further can help with out-of-memory errors.
+    export MAX_JOBS=$((CPU_COUNT > 1 ? CPU_COUNT - 1 : 1))
 fi
+case "$blas_impl" in
+    "generic")
+        # Fake openblas
+        export BLAS=OpenBLAS
+        export OpenBLAS_HOME=${PREFIX}
+        ;;
+    "mkl")
+        export BLAS=MKL
+        ;;
+    *)
+        echo "[ERROR] Unsupported BLAS implementation '${blas_impl}'" >&2
+        exit 1
+        ;;
+esac
+
 if [[ "$PKG_NAME" == "pytorch" ]]; then
   # Trick Cmake into thinking python hasn't changed
   sed "s/3\.12/$PY_VER/g" build/CMakeCache.txt.orig > build/CMakeCache.txt
@@ -163,12 +192,24 @@ elif [[ ${cuda_compiler_version} != "None" ]]; then
       echo "unknown CUDA arch, edit build.sh"
       exit 1
   esac
+
+  # Compatibility matrix for updating this list: https://en.wikipedia.org/wiki/CUDA#GPUs_supported
+  # Warning from pytorch v1.12.1: In the future we will require one to
+  # explicitly pass TORCH_CUDA_ARCH_LIST to cmake instead of implicitly
+  # setting it as an env variable.
+  # Doing this is nontrivial given that we're using setup.py as an entry point, but it should
+  # be addressed to pre-empt the upstream change, which probably won't surface as a failed
+  # configuration.
+  #
+  # See:
+  # https://pytorch.org/docs/stable/cpp_extension.html (Compute capabilities)
+  # https://github.com/pytorch/pytorch/blob/main/.ci/manywheel/build_cuda.sh
   case ${cuda_compiler_version} in
-    12.6)
+    12.[0-6])
       export TORCH_CUDA_ARCH_LIST="5.0;6.0;6.1;7.0;7.5;8.0;8.6;8.9;9.0+PTX"
       ;;
     *)
-      echo "unsupported cuda version. edit build.sh"
+      echo "No CUDA architecture list exists for CUDA v${cuda_compiler_version}. See build.sh for information on adding one."
exit 1
   esac
   export TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
@@ -203,7 +244,8 @@ case ${PKG_NAME} in
     mv build/lib.*/torch/bin/* ${PREFIX}/bin/
     mv build/lib.*/torch/lib/* ${PREFIX}/lib/
-    mv build/lib.*/torch/share/* ${PREFIX}/share/
+    # need to merge these now because we're using system pybind11, meaning the destination directory is not empty
+    rsync -a build/lib.*/torch/share/* ${PREFIX}/share/
     mv build/lib.*/torch/include/{ATen,caffe2,tensorpipe,torch,c10} ${PREFIX}/include/
     rm ${PREFIX}/lib/libtorch_python.*
@@ -211,7 +253,7 @@ case ${PKG_NAME} in
     cp build/CMakeCache.txt build/CMakeCache.txt.orig
     ;;
   pytorch)
-    $PREFIX/bin/python -m pip install . --no-deps -vvv --no-clean \
+    $PREFIX/bin/python -m pip install . --no-deps --no-build-isolation -vvv --no-clean \
     | sed "s,${CXX},\$\{CXX\},g" \
     | sed "s,${PREFIX},\$\{PREFIX\},g"
     # Keep this in ${PREFIX}/lib so that the library can be found by
diff --git a/recipe/cmake_test/CMakeLists.txt b/recipe/cmake_test/CMakeLists.txt
new file mode 100644
index 00000000..71684544
--- /dev/null
+++ b/recipe/cmake_test/CMakeLists.txt
@@ -0,0 +1,4 @@
+cmake_minimum_required(VERSION 3.12)
+project(cf_dummy LANGUAGES C CXX)
+find_package(Torch CONFIG REQUIRED)
+find_package(ATen CONFIG REQUIRED)
diff --git a/recipe/meta.yaml b/recipe/meta.yaml
index d5fc48f5..81df0c73 100644
--- a/recipe/meta.yaml
+++ b/recipe/meta.yaml
@@ -1,7 +1,10 @@
 # if you wish to build release candidate number X, append the version string with ".rcX"
 {% set version = "2.5.1" %}
-{% set build = 10 %}
+{% set build = 11 %}
+# Use a higher build number for the CUDA variant, so that it is preferred
+# by conda's solver and preferentially installed where the platform
+# supports it.
 {% if cuda_compiler_version != "None" %}
 {% set build = build + 200 %}
 {% endif %}
@@ -64,6 +67,12 @@ source:
   - patches/0015-simplify-torch.utils.cpp_extension.include_paths-use.patch
     # point to headers that are now living in $PREFIX/include instead of $SP_DIR/torch/include
   - patches/0016-point-include-paths-to-PREFIX-include.patch
+  - patches/0017-Add-conda-prefix-to-inductor-include-paths.patch
+  - patches/0018-make-ATEN_INCLUDE_DIR-relative-to-TORCH_INSTALL_PREF.patch
+  - patches/0019-remove-DESTINATION-lib-from-CMake-install-TARGETS-di.patch  # [win]
+    # backport https://github.com/pytorch/pytorch/pull/138579.patch
+  - patches/0020-inductor-Enable-cpp-wrapper-for-test_torchinductor-1.patch
+  - patches_submodules/0001-remove-DESTINATION-lib-from-CMake-install-directives.patch  # [win]
 build:
   number: {{ build }}
@@ -117,6 +126,7 @@ requirements:
     - protobuf
     - make  # [linux]
     - sccache  # [win]
+    - rsync  # [unix]
   host:
     # GPU requirements
     - cudnn  # [cuda_compiler_version != "None"]
@@ -167,6 +177,8 @@ requirements:
     - libuv
     - pkg-config  # [unix]
     - typing_extensions
+    - pybind11
+    - eigen
   run:
     # GPU requirements without run_exports
     - {{ pin_compatible('cudnn') }}  # [cuda_compiler_version != "None"]
@@ -192,6 +204,16 @@ requirements:
 # a particularity of conda-build, that output is defined in
 # the global build stage, including tests
 test:
+  requires:
+    # cmake needs a compiler to run package detection, see
+    # https://discourse.cmake.org/t/questions-about-find-package-cli-msvc/6194
+    - {{ compiler('cxx') }}
+    - {{ compiler('cuda') }}  # [cuda_compiler_version != "None"]
+    - cmake
+    - ninja
+    - pkg-config
+  files:
+    - cmake_test/
   commands:
     # libraries; peculiar formatting to avoid linter false positives about selectors
 {% set torch_libs = [
   - test -f 
$PREFIX/share/cmake/Torch/TorchConfig.cmake # [linux] - if not exist %LIBRARY_PREFIX%\share\cmake\Torch\TorchConfig.cmake exit 1 # [win] + # test integrity of CMake metadata + - cd cmake_test + - cmake -GNinja -DCMAKE_CXX_STANDARD=17 $CMAKE_ARGS . # [unix] + - cmake -GNinja -DCMAKE_CXX_STANDARD=17 %CMAKE_ARGS% . # [win] + outputs: - name: libtorch - name: pytorch @@ -299,6 +326,8 @@ outputs: - pkg-config # [unix] - typing_extensions - {{ pin_subpackage('libtorch', exact=True) }} + - pybind11 + - eigen run: - llvm-openmp # [osx] - intel-openmp {{ mkl }} # [win] @@ -314,6 +343,7 @@ outputs: - filelock - jinja2 - networkx + - pybind11 - nomkl # [blas_impl != "mkl"] - fsspec # avoid that people without GPUs needlessly download ~0.5-1GB @@ -360,6 +390,7 @@ outputs: # tools/ is needed to optimise test run # as of pytorch=2.0.0, there is a bug when trying to run tests without the tools - tools + #- .ci/pytorch/smoke_test/smoke_test.py commands: # Run pip check so as to ensure that all pytorch packages are installed # https://github.com/conda-forge/pytorch-cpu-feedstock/issues/24 @@ -367,6 +398,15 @@ outputs: - python -c "import torch; print(torch.__version__)" - python -c "import torch; assert torch.backends.mkldnn.m.is_available()" # [x86 and cuda_compiler_version == "None"] - python -c "import torch; torch.tensor(1).to('cpu').numpy(); print('numpy support enabled!!!')" + # We have had issues with openmp .dylibs being doubly loaded in certain cases. These two tests catch the (observed) issue + - python -c "import torch; import numpy" + - python -c "import numpy; import torch" + # distributed support is enabled by default on linux; for mac, we enable it manually in build.sh + - python -c "import torch; assert torch.distributed.is_available()" # [linux or osx] + - python -c "import torch; assert torch.backends.cuda.is_built()" # [linux64 and (cuda_compiler_version != "None")] + - python -c "import torch; assert torch.backends.cudnn.is_available()" # [linux64 and (cuda_compiler_version != "None")] + - python -c "import torch; assert torch.cuda.is_available()" # [linux64 and (cuda_compiler_version != "None")] + - python -c "import torch; assert torch.backends.cudnn.enabled" # [linux64 and (cuda_compiler_version != "None")] # At conda-forge, we target versions of OSX that are too old for MPS support # But if users install a newer version of OSX, they will have MPS support # https://github.com/conda-forge/pytorch-cpu-feedstock/pull/123#issuecomment-1186355073 @@ -377,8 +417,32 @@ outputs: - if not exist %LIBRARY_BIN%\torch_python.dll exit 1 # [win] - if not exist %LIBRARY_LIB%\torch_python.lib exit 1 # [win] + # See here for environment variables needed by the smoke test script + # https://github.com/pytorch/pytorch/blob/266fd35c5842902f6304aa8e7713b252cbfb243c/.ci/pytorch/smoke_test/smoke_test.py#L16 + - set MATRIX_GPU_ARCH_VERSION="{{ '.'.join((cuda_compiler_version or "").split('.')[:2]) }}" # [(cuda_compiler_version != "None") and (win)] + - set MATRIX_GPU_ARCH_TYPE="cuda" # [(cuda_compiler_version != "None") and (win)] + - set MATRIX_GPU_ARCH_VERSION="none" # [(cuda_compiler_version == "None") and (win)] + - set MATRIX_GPU_ARCH_TYPE="none" # [(cuda_compiler_version == "None") and (win)] + - set MATRIX_CHANNEL="defaults" # [win] + - set MATRIX_STABLE_VERSION={{ version }} # [win] + - set MATRIX_PACKAGE_TYPE="conda" # [win] + - set TARGET_OS="windows" # [win] + - set OMP_NUM_THREADS=4 # [win] + - export MATRIX_GPU_ARCH_VERSION="{{ '.'.join((cuda_compiler_version or "").split('.')[:2]) }}" # 
[(cuda_compiler_version != "None") and (linux and x86_64)] + - export MATRIX_GPU_ARCH_TYPE="cuda" # [(cuda_compiler_version != "None") and (linux and x86_64)] + - export MATRIX_GPU_ARCH_VERSION="none" # [(cuda_compiler_version == "None") and (not win)] + - export MATRIX_GPU_ARCH_TYPE="none" # [(cuda_compiler_version == "None") and (not win)] + - export MATRIX_CHANNEL="defaults" # [not win] + - export MATRIX_STABLE_VERSION="{{ version }}" # [not win] + - export MATRIX_PACKAGE_TYPE="conda" # [not win] + - export TARGET_OS="linux" # [linux] + - export TARGET_OS="macos-arm64" # [(osx and arm64)] + - export TARGET_OS="macos-x86_64" # [(osx and x86_64)] + - export OMP_NUM_THREADS=4 # [not win] + #- python ./smoke_test/smoke_test.py --package torchonly + # a reasonably safe subset of tests that should run under 15 minutes - # disable hypothesis because it randomly yields health check errors + # The inductor tests test torch.compile {% set tests = " ".join([ "test/test_autograd.py", "test/test_autograd_fallback.py", @@ -389,8 +453,7 @@ outputs: "test/test_nn.py", "test/test_torch.py", "test/test_xnnpack_integration.py", - "-m \"not hypothesis\"", - ]) %} + ] + (cuda_compiler_version != "None") * ["test/inductor/test_torchinductor.py"]) %} {% set skips = "(TestTorch and test_print)" %} # tolerance violation with openblas @@ -416,6 +479,9 @@ outputs: {% set skips = skips ~ " or test_BCELoss_weights_no_reduce_cuda" %} # [unix and cuda_compiler_version != "None"] {% set skips = skips ~ " or test_ctc_loss_cudnn_tensor_cuda " %} # [unix and cuda_compiler_version != "None"] {% set skips = skips ~ " or (TestTorch and test_index_add_correctness)" %} # [unix and cuda_compiler_version != "None"] + # These tests require higher-resource or more recent GPUs than the CI provides + {% set skips = skips ~ " or (TritonCodeGenTests and test_sdpa_inference_mode_aot_compile)" %} # [unix and cuda_compiler_version != "None"] + {% set skips = skips ~ " or (TestNN and test_grid_sample)" %} # [unix and cuda_compiler_version != "None"] # MKL problems {% set skips = skips ~ " or (TestLinalgCPU and test_inverse_errors_large_cpu)" %} # [unix and blas_impl == "mkl" and cuda_compiler_version != "None"] # these tests are failing with low -n values @@ -438,8 +504,9 @@ outputs: # for potential packaging problems by running a fixed subset - export OMP_NUM_THREADS=4 # [unix] # reduced paralellism to avoid OOM; test only one python version on aarch because emulation is super-slow - - python -m pytest -n 2 {{ tests }} -k "not ({{ skips }})" --durations=50 # [unix and (not aarch64 or py==312)] - - python -m pytest -v -s {{ tests }} -k "not ({{ skips }})" --durations=50 # [win] + # disable hypothesis because it randomly yields health check errors + - python -m pytest -n 2 {{ tests }} -k "not ({{ skips }})" -m "not hypothesis" --durations=50 # [unix and (not aarch64 or py==312)] + - python -m pytest -v -s {{ tests }} -k "not ({{ skips }})" -m "not hypothesis" --durations=50 # [win] # regression test for https://github.com/conda-forge/pytorch-cpu-feedstock/issues/329, where we picked up # duplicate `.pyc` files due to newest py-ver (3.13) in the build environment not matching the one in host; @@ -479,8 +546,13 @@ about: license_file: - LICENSE - NOTICE - - third_party/pybind11/LICENSE summary: PyTorch is an optimized tensor library for deep learning using GPUs and CPUs. 
+ description: | + PyTorch is a Python package that provides two high-level features: + - Tensor computation (like NumPy) with strong GPU acceleration + - Deep neural networks built on a tape-based autograd system + You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. + doc_url: https://pytorch.org/docs/ extra: recipe-maintainers: diff --git a/recipe/patches/0001-Force-usage-of-python-3-and-error-without-numpy.patch b/recipe/patches/0001-Force-usage-of-python-3-and-error-without-numpy.patch index 4ce6492a..b5519b81 100644 --- a/recipe/patches/0001-Force-usage-of-python-3-and-error-without-numpy.patch +++ b/recipe/patches/0001-Force-usage-of-python-3-and-error-without-numpy.patch @@ -1,14 +1,14 @@ -From 756045fca376345e48afb6a868b502dbfa0c584c Mon Sep 17 00:00:00 2001 +From f3a0f9aab6dce56eea590b946f60256014b61bf7 Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Sun, 1 Sep 2024 17:35:40 -0400 -Subject: [PATCH 01/16] Force usage of python 3 and error without numpy +Subject: [PATCH 01/20] Force usage of python 3 and error without numpy --- cmake/Dependencies.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index e78305e0a..15c625486 100644 +index e78305e0a8e..15c62548601 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -861,9 +861,9 @@ if(BUILD_PYTHON) @@ -32,6 +32,3 @@ index e78305e0a..15c625486 100644 caffe2_update_option(USE_NUMPY OFF) else() caffe2_update_option(USE_NUMPY ON) --- -2.48.1 - diff --git a/recipe/patches/0002-Help-find-numpy.patch b/recipe/patches/0002-Help-find-numpy.patch index 6f3fa2c3..833af9f1 100644 --- a/recipe/patches/0002-Help-find-numpy.patch +++ b/recipe/patches/0002-Help-find-numpy.patch @@ -1,14 +1,14 @@ -From 70661ad52cb2f0290de3e0758f240560e4b1e769 Mon Sep 17 00:00:00 2001 +From 21c30036b5b86f403c0cf4426165d9a6a50edb1a Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Tue, 1 Oct 2024 00:28:40 -0400 -Subject: [PATCH 02/16] Help find numpy +Subject: [PATCH 02/20] Help find numpy --- tools/setup_helpers/cmake.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py -index 4b605fe59..bde41323c 100644 +index 4b605fe5975..bde41323c76 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -305,9 +305,15 @@ class CMake: @@ -27,6 +27,3 @@ index 4b605fe59..bde41323c 100644 TORCH_BUILD_VERSION=version, **build_options, ) --- -2.48.1 - diff --git a/recipe/patches/0003-Add-USE_SYSTEM_NVTX-option-138287.patch b/recipe/patches/0003-Add-USE_SYSTEM_NVTX-option-138287.patch index af8662e4..a4c44e01 100644 --- a/recipe/patches/0003-Add-USE_SYSTEM_NVTX-option-138287.patch +++ b/recipe/patches/0003-Add-USE_SYSTEM_NVTX-option-138287.patch @@ -1,7 +1,7 @@ -From 4ae61d17c81e9d66e091c2790ac6deae6bf31204 Mon Sep 17 00:00:00 2001 +From d1826af525db41eda5020a1404f5d5521d67a5dc Mon Sep 17 00:00:00 2001 From: Jeongseok Lee Date: Sat, 19 Oct 2024 04:26:01 +0000 -Subject: [PATCH 03/16] Add USE_SYSTEM_NVTX option (#138287) +Subject: [PATCH 03/20] Add USE_SYSTEM_NVTX option (#138287) ## Summary @@ -21,7 +21,7 @@ Approved by: https://github.com/albanD 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt -index 98593c2de..ae3c3f2cb 100644 +index 98593c2de97..ae3c3f2cbd5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -470,6 +470,7 @@ option(USE_SYSTEM_FXDIV "Use system-provided fxdiv." 
OFF) @@ -41,7 +41,7 @@ index 98593c2de..ae3c3f2cb 100644 # /Z7 override option When generating debug symbols, CMake default to use the diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake -index afc1bc12a..152fbdbe6 100644 +index afc1bc12abf..152fbdbe6dd 100644 --- a/cmake/public/cuda.cmake +++ b/cmake/public/cuda.cmake @@ -170,7 +170,11 @@ else() @@ -58,7 +58,7 @@ index afc1bc12a..152fbdbe6 100644 if(nvtx3_FOUND) add_library(torch::nvtx3 INTERFACE IMPORTED) diff --git a/setup.py b/setup.py -index 2b0cfa99d..7174777ed 100644 +index 2b0cfa99d71..7174777ed4e 100644 --- a/setup.py +++ b/setup.py @@ -183,7 +183,21 @@ @@ -84,6 +84,3 @@ index 2b0cfa99d..7174777ed 100644 # # USE_MIMALLOC # Static link mimalloc into C10, and use mimalloc in alloc_cpu & alloc_free. --- -2.48.1 - diff --git a/recipe/patches/0004-Update-sympy-version.patch b/recipe/patches/0004-Update-sympy-version.patch index 5dd72f7c..81a66b3f 100644 --- a/recipe/patches/0004-Update-sympy-version.patch +++ b/recipe/patches/0004-Update-sympy-version.patch @@ -1,14 +1,14 @@ -From 2c6db02c01ad080c8dc8ae0b78be2b93099c2ac8 Mon Sep 17 00:00:00 2001 +From e3219c5fe8834753b0cf9e92be4d1ef1e874f370 Mon Sep 17 00:00:00 2001 From: Jeongseok Lee Date: Thu, 17 Oct 2024 15:04:05 -0700 -Subject: [PATCH 04/16] Update sympy version +Subject: [PATCH 04/20] Update sympy version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py -index 7174777ed..65be34e39 100644 +index 7174777ed4e..65be34e39b1 100644 --- a/setup.py +++ b/setup.py @@ -1158,7 +1158,7 @@ def main(): @@ -20,6 +20,3 @@ index 7174777ed..65be34e39 100644 "networkx", "jinja2", "fsspec", --- -2.48.1 - diff --git a/recipe/patches/0005-Fix-duplicate-linker-script.patch b/recipe/patches/0005-Fix-duplicate-linker-script.patch index 7cc82435..cb09dcdf 100644 --- a/recipe/patches/0005-Fix-duplicate-linker-script.patch +++ b/recipe/patches/0005-Fix-duplicate-linker-script.patch @@ -1,14 +1,14 @@ -From fa5bb8f1acd0195efadc35c8fbb9199be92932d9 Mon Sep 17 00:00:00 2001 +From 08a1f44fbc81324aa98d720dfb7b87a261923ac2 Mon Sep 17 00:00:00 2001 From: Jeongseok Lee Date: Sun, 3 Nov 2024 01:12:36 -0700 -Subject: [PATCH 05/16] Fix duplicate linker script +Subject: [PATCH 05/20] Fix duplicate linker script --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py -index 65be34e39..b0e01e0d1 100644 +index 65be34e39b1..b0e01e0d1ee 100644 --- a/setup.py +++ b/setup.py @@ -1184,7 +1184,9 @@ def main(): @@ -22,6 +22,3 @@ index 65be34e39..b0e01e0d1 100644 os.environ["CFLAGS"] = ( os.getenv("CFLAGS", "") + " -ffunction-sections -fdata-sections" ) --- -2.48.1 - diff --git a/recipe/patches/0006-fix-3.13-pickle-error-in-serialization.py-136034.patch b/recipe/patches/0006-fix-3.13-pickle-error-in-serialization.py-136034.patch index cddb8b68..326e6285 100644 --- a/recipe/patches/0006-fix-3.13-pickle-error-in-serialization.py-136034.patch +++ b/recipe/patches/0006-fix-3.13-pickle-error-in-serialization.py-136034.patch @@ -1,7 +1,7 @@ -From 6fc695312cd062e13c2482b52ae8d028bd7c043a Mon Sep 17 00:00:00 2001 +From 15df314a41c69a31c0443254d5552aa1b39d708d Mon Sep 17 00:00:00 2001 From: William Wen Date: Fri, 13 Sep 2024 13:02:33 -0700 -Subject: [PATCH 06/16] fix 3.13 pickle error in serialization.py (#136034) +Subject: [PATCH 06/20] fix 3.13 pickle error in serialization.py (#136034) Error encountered when adding dynamo 3.13 support. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136034 @@ -11,7 +11,7 @@ Approved by: https://github.com/albanD 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/torch/serialization.py b/torch/serialization.py -index d936d31d6..d937680c0 100644 +index d936d31d6f5..d937680c031 100644 --- a/torch/serialization.py +++ b/torch/serialization.py @@ -1005,8 +1005,12 @@ def _legacy_save(obj, f, pickle_module, pickle_protocol) -> None: @@ -44,6 +44,3 @@ index d936d31d6..d937680c0 100644 pickler.dump(obj) data_value = data_buf.getvalue() zip_file.write_record("data.pkl", data_value, len(data_value)) --- -2.48.1 - diff --git a/recipe/patches/0007-Allow-users-to-overwrite-ld-with-environment-variabl.patch b/recipe/patches/0007-Allow-users-to-overwrite-ld-with-environment-variabl.patch index b847ba1a..ad215aa9 100644 --- a/recipe/patches/0007-Allow-users-to-overwrite-ld-with-environment-variabl.patch +++ b/recipe/patches/0007-Allow-users-to-overwrite-ld-with-environment-variabl.patch @@ -1,7 +1,7 @@ -From d5c8df70422afa07dc212266d420f923f5887f99 Mon Sep 17 00:00:00 2001 +From 655f694854c3eafdd631235b60bc6c1b279218ed Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Thu, 3 Oct 2024 22:49:56 -0400 -Subject: [PATCH 07/16] Allow users to overwrite ld with environment variables +Subject: [PATCH 07/20] Allow users to overwrite ld with environment variables This should help in the case of cross compilation. @@ -11,7 +11,7 @@ xref: https://github.com/conda-forge/pytorch-cpu-feedstock/pull/261 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/setup_helpers/generate_linker_script.py b/tools/setup_helpers/generate_linker_script.py -index 11c397a9e..e66fc1970 100644 +index 11c397a9e5f..e66fc197062 100644 --- a/tools/setup_helpers/generate_linker_script.py +++ b/tools/setup_helpers/generate_linker_script.py @@ -1,3 +1,4 @@ @@ -30,6 +30,3 @@ index 11c397a9e..e66fc1970 100644 "\n" ) --- -2.48.1 - diff --git a/recipe/patches/0008-Allow-overriding-CUDA-related-paths.patch b/recipe/patches/0008-Allow-overriding-CUDA-related-paths.patch index 272d200c..fbfe0560 100644 --- a/recipe/patches/0008-Allow-overriding-CUDA-related-paths.patch +++ b/recipe/patches/0008-Allow-overriding-CUDA-related-paths.patch @@ -1,7 +1,7 @@ -From da7b07f8e3165bf89b08b5a716e539ae9a7afb1a Mon Sep 17 00:00:00 2001 +From f03bf82d9da9cccb2cf4d4833c1a6349622dc37d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Wed, 27 Nov 2024 13:47:23 +0100 -Subject: [PATCH 08/16] Allow overriding CUDA-related paths +Subject: [PATCH 08/20] Allow overriding CUDA-related paths --- cmake/Modules/FindCUDAToolkit.cmake | 2 +- @@ -9,7 +9,7 @@ Subject: [PATCH 08/16] Allow overriding CUDA-related paths 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/Modules/FindCUDAToolkit.cmake b/cmake/Modules/FindCUDAToolkit.cmake -index ec9ae530a..b7c0bd9fc 100644 +index ec9ae530aa6..b7c0bd9fc51 100644 --- a/cmake/Modules/FindCUDAToolkit.cmake +++ b/cmake/Modules/FindCUDAToolkit.cmake @@ -497,7 +497,7 @@ Result variables @@ -22,7 +22,7 @@ index ec9ae530a..b7c0bd9fc 100644 set(CUDAToolkit_LIBRARY_ROOT "${CMAKE_CUDA_COMPILER_LIBRARY_ROOT}") set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_TOOLKIT_VERSION}") diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py -index bde41323c..b171837cd 100644 +index bde41323c76..b171837cd4a 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -252,7 +252,7 @@ class CMake: @@ -34,6 +34,3 @@ index bde41323c..b171837cd 
100644 ("EXITCODE", "EXITCODE__TRYRUN_OUTPUT") ): build_options[var] = val --- -2.48.1 - diff --git a/recipe/patches/0009-Fix-test-test_linalg.py-for-NumPy-2-136800.patch b/recipe/patches/0009-Fix-test-test_linalg.py-for-NumPy-2-136800.patch index e1befef6..580fe42a 100644 --- a/recipe/patches/0009-Fix-test-test_linalg.py-for-NumPy-2-136800.patch +++ b/recipe/patches/0009-Fix-test-test_linalg.py-for-NumPy-2-136800.patch @@ -1,7 +1,7 @@ -From 3429795de33cac2e508397dd2d9f5f5c96f185c3 Mon Sep 17 00:00:00 2001 +From 4b1faf6ba142953ce2730766db44f8d98d161ef0 Mon Sep 17 00:00:00 2001 From: Haifeng Jin Date: Tue, 1 Oct 2024 07:53:24 +0000 -Subject: [PATCH 09/16] Fix test/test_linalg.py for NumPy 2 (#136800) +Subject: [PATCH 09/20] Fix test/test_linalg.py for NumPy 2 (#136800) Related to #107302. @@ -36,7 +36,7 @@ Approved by: https://github.com/lezcano 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/test/test_linalg.py b/test/test_linalg.py -index e9ec874d6..060bccef2 100644 +index e9ec874d695..060bccef2e5 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -2351,7 +2351,7 @@ class TestLinalg(TestCase): @@ -75,6 +75,3 @@ index e9ec874d6..060bccef2 100644 reflectors_i[:] = reflectors_tmp.T reflectors = reflectors.view(*A_cpu.shape) tau = tau.view(tau_shape) --- -2.48.1 - diff --git a/recipe/patches/0010-Fixes-NumPy-2-test-failures-in-test_torch.py-137740.patch b/recipe/patches/0010-Fixes-NumPy-2-test-failures-in-test_torch.py-137740.patch index bd5aa553..6495b150 100644 --- a/recipe/patches/0010-Fixes-NumPy-2-test-failures-in-test_torch.py-137740.patch +++ b/recipe/patches/0010-Fixes-NumPy-2-test-failures-in-test_torch.py-137740.patch @@ -1,7 +1,7 @@ -From a8ddbe6b682347fdc86c5052b244df4f95b926ac Mon Sep 17 00:00:00 2001 +From 032b9be9ca7f9ae174e75554cecc82600ea3ef54 Mon Sep 17 00:00:00 2001 From: Haifeng Jin Date: Sat, 12 Oct 2024 02:40:17 +0000 -Subject: [PATCH 10/16] Fixes NumPy 2 test failures in test_torch.py (#137740) +Subject: [PATCH 10/20] Fixes NumPy 2 test failures in test_torch.py (#137740) Related to #107302 @@ -24,7 +24,7 @@ Approved by: https://github.com/ezyang 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_torch.py b/test/test_torch.py -index be4d61808..c6fd6ac9f 100644 +index be4d6180819..c6fd6ac9f19 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -2891,7 +2891,7 @@ else: @@ -58,6 +58,3 @@ index be4d61808..c6fd6ac9f 100644 ) @skipIfTorchDynamo("np.float64 restored as float32 after graph break.") --- -2.48.1 - diff --git a/recipe/patches/0011-Use-BLAS_USE_CBLAS_DOT-for-OpenBLAS-builds.patch b/recipe/patches/0011-Use-BLAS_USE_CBLAS_DOT-for-OpenBLAS-builds.patch index 2d9b1995..193ce159 100644 --- a/recipe/patches/0011-Use-BLAS_USE_CBLAS_DOT-for-OpenBLAS-builds.patch +++ b/recipe/patches/0011-Use-BLAS_USE_CBLAS_DOT-for-OpenBLAS-builds.patch @@ -1,7 +1,7 @@ -From 113c9ebec11cba2f1d43bfd4ac03eb02c5c921a8 Mon Sep 17 00:00:00 2001 +From 56f1528fa072023fb2724d5abf8790f2f6cc3aaa Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Wed, 18 Dec 2024 03:59:00 +0000 -Subject: [PATCH 11/16] Use BLAS_USE_CBLAS_DOT for OpenBLAS builds +Subject: [PATCH 11/20] Use BLAS_USE_CBLAS_DOT for OpenBLAS builds There are two calling conventions for *dotu functions @@ -31,7 +31,7 @@ functional calls. 
1 file changed, 1 insertion(+) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 15c625486..3965416eb 100644 +index 15c62548601..3965416eb29 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -182,6 +182,7 @@ elseif(BLAS STREQUAL "OpenBLAS") @@ -42,6 +42,3 @@ index 15c625486..3965416eb 100644 elseif(BLAS STREQUAL "BLIS") find_package(BLIS REQUIRED) include_directories(SYSTEM ${BLIS_INCLUDE_DIR}) --- -2.48.1 - diff --git a/recipe/patches/0012-fix-issue-142484.patch b/recipe/patches/0012-fix-issue-142484.patch index bb4a2e6e..00f1e3d2 100644 --- a/recipe/patches/0012-fix-issue-142484.patch +++ b/recipe/patches/0012-fix-issue-142484.patch @@ -1,7 +1,7 @@ -From 323bb15a6b1f601d79211bd292c26cb886a5d60e Mon Sep 17 00:00:00 2001 +From beba58d724cc1bd7ca73660b0a5ad9e61ae0c562 Mon Sep 17 00:00:00 2001 From: "Zheng, Zhaoqiong" Date: Fri, 27 Dec 2024 13:49:36 +0800 -Subject: [PATCH 12/16] fix issue 142484 +Subject: [PATCH 12/20] fix issue 142484 From https://github.com/pytorch/pytorch/pull/143894 --- @@ -9,7 +9,7 @@ From https://github.com/pytorch/pytorch/pull/143894 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp -index e26cfbf6d..c61b76d32 100644 +index e26cfbf6d8e..c61b76d3205 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -477,7 +477,17 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes, @@ -31,6 +31,3 @@ index e26cfbf6d..c61b76d32 100644 auto descriptor = _plan_mkl_fft( input.strides(), out.strides(), signal_size, input.is_complex(), out.is_complex(), normalization, forward, value_type); --- -2.48.1 - diff --git a/recipe/patches/0013-Fix-FindOpenBLAS.patch b/recipe/patches/0013-Fix-FindOpenBLAS.patch index 47e34885..f539d0a6 100644 --- a/recipe/patches/0013-Fix-FindOpenBLAS.patch +++ b/recipe/patches/0013-Fix-FindOpenBLAS.patch @@ -1,14 +1,14 @@ -From 4ca7ade3211380629ab56f3c965edd1b6387d1e0 Mon Sep 17 00:00:00 2001 +From 816a248a4425a97350959e412666e6db9012a52e Mon Sep 17 00:00:00 2001 From: Bas Zalmstra Date: Thu, 16 May 2024 10:46:49 +0200 -Subject: [PATCH 13/16] Fix FindOpenBLAS +Subject: [PATCH 13/20] Fix FindOpenBLAS --- cmake/Modules/FindOpenBLAS.cmake | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cmake/Modules/FindOpenBLAS.cmake b/cmake/Modules/FindOpenBLAS.cmake -index 69d8227ae..0d12185c7 100644 +index 69d8227aea5..0d12185c799 100644 --- a/cmake/Modules/FindOpenBLAS.cmake +++ b/cmake/Modules/FindOpenBLAS.cmake @@ -31,22 +31,25 @@ SET(Open_BLAS_LIB_SEARCH_PATHS @@ -43,6 +43,3 @@ index 69d8227ae..0d12185c7 100644 IF (OpenBLAS_FOUND) IF (NOT OpenBLAS_FIND_QUIETLY) --- -2.48.1 - diff --git a/recipe/patches/0014-CD-Enable-Python-3.13-on-windows-138095.patch b/recipe/patches/0014-CD-Enable-Python-3.13-on-windows-138095.patch index 031fce6d..7a2df88f 100644 --- a/recipe/patches/0014-CD-Enable-Python-3.13-on-windows-138095.patch +++ b/recipe/patches/0014-CD-Enable-Python-3.13-on-windows-138095.patch @@ -1,7 +1,7 @@ -From 3b32a078793f06e80d88c356871953f254d4d6c3 Mon Sep 17 00:00:00 2001 +From db896f927403f55a18f931b18a6469cb4e37d322 Mon Sep 17 00:00:00 2001 From: atalman Date: Tue, 12 Nov 2024 12:28:10 +0000 -Subject: [PATCH 14/16] CD Enable Python 3.13 on windows (#138095) +Subject: [PATCH 14/20] CD Enable Python 3.13 on windows (#138095) Adding CD windows. 
Part of: https://github.com/pytorch/pytorch/issues/130249 Builder PR landed with smoke test: https://github.com/pytorch/builder/pull/2035 @@ -16,7 +16,7 @@ Cherry-pick-note: minus changes in `.github/*` 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/functorch/csrc/dim/dim.cpp b/functorch/csrc/dim/dim.cpp -index 722618efb..f98818bfd 100644 +index 722618efbb0..f98818bfdcc 100644 --- a/functorch/csrc/dim/dim.cpp +++ b/functorch/csrc/dim/dim.cpp @@ -38,6 +38,7 @@ PyObject* Dim_init() { @@ -28,7 +28,7 @@ index 722618efb..f98818bfd 100644 #include "internal/pycore_opcode.h" #undef Py_BUILD_CORE diff --git a/functorch/csrc/dim/dim_opcode.c b/functorch/csrc/dim/dim_opcode.c -index 81ba62a37..1b5d06773 100644 +index 81ba62a3781..1b5d0677344 100644 --- a/functorch/csrc/dim/dim_opcode.c +++ b/functorch/csrc/dim/dim_opcode.c @@ -1,6 +1,17 @@ @@ -50,6 +50,3 @@ index 81ba62a37..1b5d06773 100644 +#undef NEED_OPCODE_TABLES +#undef Py_BUILD_CORE +#endif --- -2.48.1 - diff --git a/recipe/patches/0015-simplify-torch.utils.cpp_extension.include_paths-use.patch b/recipe/patches/0015-simplify-torch.utils.cpp_extension.include_paths-use.patch index e8ff9e59..3736ca78 100644 --- a/recipe/patches/0015-simplify-torch.utils.cpp_extension.include_paths-use.patch +++ b/recipe/patches/0015-simplify-torch.utils.cpp_extension.include_paths-use.patch @@ -1,7 +1,7 @@ -From 4465b713563855e7eb5475758226f3a90f675f55 Mon Sep 17 00:00:00 2001 +From 33790dfbf966e7d8ea4ff6798d2ff92474d84079 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 23 Jan 2025 22:46:58 +1100 -Subject: [PATCH 15/16] simplify torch.utils.cpp_extension.include_paths; use +Subject: [PATCH 15/20] simplify torch.utils.cpp_extension.include_paths; use it in cpp_builder The /TH headers have not existed since pytorch 1.11 @@ -11,7 +11,7 @@ The /TH headers have not existed since pytorch 1.11 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/torch/_inductor/cpp_builder.py b/torch/_inductor/cpp_builder.py -index 95a0bff86..860e7fb06 100644 +index 95a0bff86fd..860e7fb062f 100644 --- a/torch/_inductor/cpp_builder.py +++ b/torch/_inductor/cpp_builder.py @@ -743,16 +743,9 @@ def _get_build_args_of_chosen_isa(vec_isa: VecISA) -> Tuple[List[str], List[str] @@ -35,7 +35,7 @@ index 95a0bff86..860e7fb06 100644 libraries = [] if sys.platform != "darwin" and not config.is_fbcode(): diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py -index aaa45ea4c..3f584ef55 100644 +index aaa45ea4c90..3f584ef5598 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -1159,10 +1159,6 @@ def include_paths(cuda: bool = False) -> List[str]: @@ -49,6 +49,3 @@ index aaa45ea4c..3f584ef55 100644 ] if cuda and IS_HIP_EXTENSION: paths.append(os.path.join(lib_include, 'THH')) --- -2.48.1 - diff --git a/recipe/patches/0016-point-include-paths-to-PREFIX-include.patch b/recipe/patches/0016-point-include-paths-to-PREFIX-include.patch index fecf4d0f..764e24af 100644 --- a/recipe/patches/0016-point-include-paths-to-PREFIX-include.patch +++ b/recipe/patches/0016-point-include-paths-to-PREFIX-include.patch @@ -1,14 +1,14 @@ -From 4d485fc0a5e3226e528e9dab17b184ff9835a045 Mon Sep 17 00:00:00 2001 +From 799f6fa59dac93dabbbcf72d46f4e1334e3d65d9 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Thu, 23 Jan 2025 22:58:14 +1100 -Subject: [PATCH 16/16] point include paths to $PREFIX/include +Subject: [PATCH 16/20] point include paths to $PREFIX/include --- torch/utils/cpp_extension.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py -index 3f584ef55..4210f62b6 100644 +index 3f584ef5598..4210f62b6db 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -1155,10 +1155,19 @@ def include_paths(cuda: bool = False) -> List[str]: @@ -31,6 +31,3 @@ index 3f584ef55..4210f62b6 100644 ] if cuda and IS_HIP_EXTENSION: paths.append(os.path.join(lib_include, 'THH')) --- -2.48.1 - diff --git a/recipe/patches/0017-Add-conda-prefix-to-inductor-include-paths.patch b/recipe/patches/0017-Add-conda-prefix-to-inductor-include-paths.patch new file mode 100644 index 00000000..e2111c54 --- /dev/null +++ b/recipe/patches/0017-Add-conda-prefix-to-inductor-include-paths.patch @@ -0,0 +1,27 @@ +From 9f73a02bacf9680833ac64657fde6762d33ab200 Mon Sep 17 00:00:00 2001 +From: Daniel Petry +Date: Tue, 21 Jan 2025 17:45:23 -0600 +Subject: [PATCH 17/20] Add conda prefix to inductor include paths + +Currently inductor doesn't look in conda's includes and libs. This results in +errors when it tries to compile, if system versions are being used of +dependencies (e.g., sleef). + +Note that this is for inductor's JIT mode, not its AOT mode, for which the +end user provides a _compile_flags.json file. +--- + torch/_inductor/cpp_builder.py | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/torch/_inductor/cpp_builder.py b/torch/_inductor/cpp_builder.py +index 860e7fb062f..76c61375d91 100644 +--- a/torch/_inductor/cpp_builder.py ++++ b/torch/_inductor/cpp_builder.py +@@ -1048,6 +1048,7 @@ def get_cpp_torch_options( + + python_include_dirs + + torch_include_dirs + + omp_include_dir_paths ++ + [os.getenv('CONDA_PREFIX') + '/include'] + ) + cflags = sys_libs_cflags + omp_cflags + ldflags = omp_ldflags diff --git a/recipe/patches/0018-make-ATEN_INCLUDE_DIR-relative-to-TORCH_INSTALL_PREF.patch b/recipe/patches/0018-make-ATEN_INCLUDE_DIR-relative-to-TORCH_INSTALL_PREF.patch new file mode 100644 index 00000000..028d79be --- /dev/null +++ b/recipe/patches/0018-make-ATEN_INCLUDE_DIR-relative-to-TORCH_INSTALL_PREF.patch @@ -0,0 +1,25 @@ +From b0cfa0f728e96a3a9d6f7434e2c02d74d6daa9a9 Mon Sep 17 00:00:00 2001 +From: "H. 
Vetinari" +Date: Tue, 28 Jan 2025 14:15:34 +1100 +Subject: [PATCH 18/20] make ATEN_INCLUDE_DIR relative to TORCH_INSTALL_PREFIX + +we cannot set CMAKE_INSTALL_PREFIX without the pytorch build complaining, but we can +use TORCH_INSTALL_PREFIX, which is set correctly relative to our CMake files already: +https://github.com/pytorch/pytorch/blob/v2.5.1/cmake/TorchConfig.cmake.in#L47 +--- + aten/src/ATen/CMakeLists.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index 6d9152a4d07..aa4dd7b05cc 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -563,7 +563,7 @@ if(USE_ROCM) + # list(APPEND ATen_HIP_DEPENDENCY_LIBS ATEN_CUDA_FILES_GEN_LIB) + endif() + +-set(ATEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_INCLUDE_DIR}") ++set(ATEN_INCLUDE_DIR "${TORCH_INSTALL_PREFIX}/${AT_INSTALL_INCLUDE_DIR}") + configure_file(ATenConfig.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake") + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" + DESTINATION "${AT_INSTALL_SHARE_DIR}/cmake/ATen") diff --git a/recipe/patches/0019-remove-DESTINATION-lib-from-CMake-install-TARGETS-di.patch b/recipe/patches/0019-remove-DESTINATION-lib-from-CMake-install-TARGETS-di.patch new file mode 100644 index 00000000..7aa41192 --- /dev/null +++ b/recipe/patches/0019-remove-DESTINATION-lib-from-CMake-install-TARGETS-di.patch @@ -0,0 +1,158 @@ +From f7db4cbfb0af59027ed8bdcd0387dba6fbcb1192 Mon Sep 17 00:00:00 2001 +From: "H. Vetinari" +Date: Tue, 28 Jan 2025 10:58:29 +1100 +Subject: [PATCH 19/20] remove `DESTINATION lib` from CMake `install(TARGETS` + directives + +Suggested-By: Silvio Traversaro +--- + c10/CMakeLists.txt | 2 +- + c10/cuda/CMakeLists.txt | 2 +- + c10/hip/CMakeLists.txt | 2 +- + c10/xpu/CMakeLists.txt | 2 +- + caffe2/CMakeLists.txt | 18 +++++++++--------- + torch/CMakeLists.txt | 2 +- + torch/lib/libshm_windows/CMakeLists.txt | 2 +- + 7 files changed, 15 insertions(+), 15 deletions(-) + +diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt +index 80e172497d5..d7f8987020d 100644 +--- a/c10/CMakeLists.txt ++++ b/c10/CMakeLists.txt +@@ -162,7 +162,7 @@ if(NOT BUILD_LIBTORCHLESS) + # Note: for now, we will put all export path into one single Caffe2Targets group + # to deal with the cmake deployment need. Inside the Caffe2Targets set, the + # individual libraries like libc10.so and libcaffe2.so are still self-contained. +- install(TARGETS c10 EXPORT Caffe2Targets DESTINATION lib) ++ install(TARGETS c10 EXPORT Caffe2Targets) + endif() + + install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR} +diff --git a/c10/cuda/CMakeLists.txt b/c10/cuda/CMakeLists.txt +index 3327dab4779..9336c9e8f77 100644 +--- a/c10/cuda/CMakeLists.txt ++++ b/c10/cuda/CMakeLists.txt +@@ -82,7 +82,7 @@ if(NOT BUILD_LIBTORCHLESS) + # Note: for now, we will put all export path into one single Caffe2Targets group + # to deal with the cmake deployment need. Inside the Caffe2Targets set, the + # individual libraries like libc10.so and libcaffe2.so are still self-contained. 
+-install(TARGETS c10_cuda EXPORT Caffe2Targets DESTINATION lib) ++install(TARGETS c10_cuda EXPORT Caffe2Targets) + + endif() + +diff --git a/c10/hip/CMakeLists.txt b/c10/hip/CMakeLists.txt +index f153030e793..514c6d29266 100644 +--- a/c10/hip/CMakeLists.txt ++++ b/c10/hip/CMakeLists.txt +@@ -55,7 +55,7 @@ if(NOT BUILD_LIBTORCHLESS) + $ + $ + $) +- install(TARGETS c10_hip EXPORT Caffe2Targets DESTINATION lib) ++ install(TARGETS c10_hip EXPORT Caffe2Targets) + set(C10_HIP_LIB c10_hip) + endif() + +diff --git a/c10/xpu/CMakeLists.txt b/c10/xpu/CMakeLists.txt +index 01f77d61713..437ade657f9 100644 +--- a/c10/xpu/CMakeLists.txt ++++ b/c10/xpu/CMakeLists.txt +@@ -45,7 +45,7 @@ if(NOT BUILD_LIBTORCHLESS) + $ + $ + ) +- install(TARGETS c10_xpu EXPORT Caffe2Targets DESTINATION lib) ++ install(TARGETS c10_xpu EXPORT Caffe2Targets) + set(C10_XPU_LIB c10_xpu) + add_subdirectory(test) + endif() +diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt +index 9be7f3732f3..b51c7cc637b 100644 +--- a/caffe2/CMakeLists.txt ++++ b/caffe2/CMakeLists.txt +@@ -549,7 +549,7 @@ if(USE_CUDA) + endif() + + target_link_libraries(caffe2_nvrtc PRIVATE caffe2::nvrtc ${DELAY_LOAD_FLAGS}) +- install(TARGETS caffe2_nvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}") ++ install(TARGETS caffe2_nvrtc) + if(USE_NCCL) + list(APPEND Caffe2_GPU_SRCS + ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp) +@@ -609,7 +609,7 @@ if(USE_ROCM) + target_link_libraries(caffe2_nvrtc ${PYTORCH_HIP_LIBRARIES} ${ROCM_HIPRTC_LIB}) + target_include_directories(caffe2_nvrtc PRIVATE ${CMAKE_BINARY_DIR}) + target_compile_definitions(caffe2_nvrtc PRIVATE USE_ROCM __HIP_PLATFORM_AMD__) +- install(TARGETS caffe2_nvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}") ++ install(TARGETS caffe2_nvrtc) + endif() + + if(NOT NO_API AND NOT BUILD_LITE_INTERPRETER) +@@ -995,7 +995,7 @@ elseif(USE_CUDA) + CUDA::culibos ${CMAKE_DL_LIBS}) + endif() + set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp PROPERTIES COMPILE_FLAGS "-DBUILD_LAZY_CUDA_LINALG") +- install(TARGETS torch_cuda_linalg DESTINATION "${TORCH_INSTALL_LIB_DIR}") ++ install(TARGETS torch_cuda_linalg) + endif() + + if(USE_PRECOMPILED_HEADERS) +@@ -1467,17 +1467,17 @@ endif() + + caffe2_interface_library(torch torch_library) + +-install(TARGETS torch_cpu torch_cpu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") ++install(TARGETS torch_cpu torch_cpu_library EXPORT Caffe2Targets) + + if(USE_CUDA) +- install(TARGETS torch_cuda torch_cuda_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") ++ install(TARGETS torch_cuda torch_cuda_library EXPORT Caffe2Targets) + elseif(USE_ROCM) +- install(TARGETS torch_hip torch_hip_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") ++ install(TARGETS torch_hip torch_hip_library EXPORT Caffe2Targets) + elseif(USE_XPU) +- install(TARGETS torch_xpu torch_xpu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") ++ install(TARGETS torch_xpu torch_xpu_library EXPORT Caffe2Targets) + endif() + +-install(TARGETS torch torch_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") ++install(TARGETS torch torch_library EXPORT Caffe2Targets) + + target_link_libraries(torch PUBLIC torch_cpu_library) + +@@ -1616,7 +1616,7 @@ if(BUILD_SHARED_LIBS) + target_link_libraries(torch_global_deps torch::nvtoolsext) + endif() + endif() +- install(TARGETS torch_global_deps DESTINATION "${TORCH_INSTALL_LIB_DIR}") ++ install(TARGETS torch_global_deps) + endif() + + # ---[ Caffe2 
HIP sources. +diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt +index c74b45431c9..80fb5e7734e 100644 +--- a/torch/CMakeLists.txt ++++ b/torch/CMakeLists.txt +@@ -447,7 +447,7 @@ if(NOT TORCH_PYTHON_LINK_FLAGS STREQUAL "") + set_target_properties(torch_python PROPERTIES LINK_FLAGS ${TORCH_PYTHON_LINK_FLAGS}) + endif() + +-install(TARGETS torch_python DESTINATION "${TORCH_INSTALL_LIB_DIR}") ++install(TARGETS torch_python) + + # Generate torch/version.py from the appropriate CMake cache variables. + if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") +diff --git a/torch/lib/libshm_windows/CMakeLists.txt b/torch/lib/libshm_windows/CMakeLists.txt +index df2a1064938..5fa15e6be31 100644 +--- a/torch/lib/libshm_windows/CMakeLists.txt ++++ b/torch/lib/libshm_windows/CMakeLists.txt +@@ -19,7 +19,7 @@ target_include_directories(shm PRIVATE + target_link_libraries(shm torch c10) + + +-install(TARGETS shm DESTINATION "${LIBSHM_INSTALL_LIB_SUBDIR}") ++install(TARGETS shm) + install(FILES libshm.h DESTINATION "include") + + if(MSVC AND BUILD_SHARED_LIBS) diff --git a/recipe/patches/0020-inductor-Enable-cpp-wrapper-for-test_torchinductor-1.patch b/recipe/patches/0020-inductor-Enable-cpp-wrapper-for-test_torchinductor-1.patch new file mode 100644 index 00000000..68753e5c --- /dev/null +++ b/recipe/patches/0020-inductor-Enable-cpp-wrapper-for-test_torchinductor-1.patch @@ -0,0 +1,272 @@ +From c06d20d68d0190967494c08df93207828af71628 Mon Sep 17 00:00:00 2001 +From: Bin Bao +Date: Mon, 28 Oct 2024 07:44:46 -0700 +Subject: [PATCH 20/20] [inductor] Enable cpp wrapper for test_torchinductor + (#138579) + +Summary: Expand cpp wrapper testing to test_torchinductor. Using skip_cpp_wrapper to skip failing tests for now, and fixes are coming later. + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/138579 +Approved by: https://github.com/chenyang78, https://github.com/benjaminglass1 + +[Cherry-pick note: dropped changes in .ci/pytorch/test.sh] +--- + test/inductor/test_torchinductor.py | 47 +++++++++++++++++++++++++++-- + 1 file changed, 44 insertions(+), 3 deletions(-) + +diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py +index 610f5d27332..5c3c50e5a70 100644 +--- a/test/inductor/test_torchinductor.py ++++ b/test/inductor/test_torchinductor.py +@@ -689,7 +689,9 @@ def assertGeneratedKernelCountEqual(self: TestCase, expected: int): + # and non-persistent reduction kernels for the same node schedule. + # That will mess up with the kernel count. Just don't check it. + return +- if config.cpp_wrapper: ++ if config.cpp_wrapper and self.device != "cpu": ++ # FIXME: cpp wrapper codegen for cuda is done in two passes. Update ++ # this once we move to the new one-pass solution. 
+ expected *= 2 + self.assertEqual(torch._inductor.metrics.generated_kernel_count, expected) + +@@ -767,6 +769,16 @@ def skip_if_gpu_halide(fn): + return wrapper + + ++def skip_if_cpp_wrapper(fn): ++ @functools.wraps(fn) ++ def wrapper(self): ++ if config.cpp_wrapper: ++ raise unittest.SkipTest("cpp wrapper bug to be fixed") ++ return fn(self) ++ ++ return wrapper ++ ++ + @instantiate_parametrized_tests + class CommonTemplate: + def test_bool(self): +@@ -1362,6 +1374,7 @@ class CommonTemplate: + + @config.patch({"fx_graph_cache": False}) + @skipIfWindows(msg="torch._dynamo.exc.Unsupported") ++ @skip_if_cpp_wrapper + def test_forced_buffer_realize(self): + # Test torch._test_inductor_realize forces a buffer to be realized + def fn(a): +@@ -1373,6 +1386,7 @@ class CommonTemplate: + + @config.patch({"fx_graph_cache": False}) + @skipIfWindows(msg="torch._dynamo.exc.Unsupported") ++ @skip_if_cpp_wrapper + def test_scheduler_vertical_fusion1(self): + realize = test_operators.realize + +@@ -2966,6 +2980,7 @@ class CommonTemplate: + self.common(fn, (torch.randn(8, 8), torch.randn(8, 8))) + + @skip_if_halide # only 32-bit indexing ++ @skip_if_cpp_wrapper # OOM + def test_large_tensor_reduction(self): + if not _has_sufficient_memory(self.device, 4.5 * 1024**3): # 4.5 GiB + raise unittest.SkipTest("insufficient memory") +@@ -2987,6 +3002,7 @@ class CommonTemplate: + self.assertEqual(actual, expect) + + @skip_if_gpu_halide # only 32-bit indexing ++ @skip_if_cpp_wrapper # OOM + def test_large_broadcast_reduction(self): + if self.device == "cpu": + raise unittest.SkipTest("Fails on CPU") +@@ -3009,6 +3025,7 @@ class CommonTemplate: + self.assertEqual(actual, expect) + + @skip_if_halide # only 32-bit indexing ++ @skip_if_cpp_wrapper # OOM + def test_large_pointwise(self): + if not _has_sufficient_memory(self.device, 2 * (2**31 + 1)): + raise unittest.SkipTest("insufficient memory") +@@ -3045,6 +3062,7 @@ class CommonTemplate: + self.assertTrue((actual == 4).all()) + + @skip_if_halide # only 32-bit indexing ++ @skip_if_cpp_wrapper # OOM + def test_large_strided_reduction(self): + # Test 64-bit indexing is used when input numel is less than INT_MAX + # but stride calculations go above INT_MAX +@@ -3317,6 +3335,7 @@ class CommonTemplate: + ) + + @with_tf32_off ++ @skip_if_cpp_wrapper + @config.patch(use_mixed_mm=True) + def test_uint4x2_mixed_mm(self): + def fn(a, b): +@@ -3346,10 +3365,12 @@ class CommonTemplate: + t2 = torch.arange(9, dtype=torch.int64, device=self.device).view(3, 3) + + msg = "expected .* and .* to have the same dtype, but got: .* != .*" +- with self.assertRaisesRegex(RuntimeError, msg): +- torch.compile(fn)(t1, t2) + with self.assertRaisesRegex(RuntimeError, msg): + fn(t1, t2) ++ if config.cpp_wrapper: ++ msg = "aoti_torch_.* API call failed at .*" ++ with self.assertRaisesRegex(RuntimeError, msg): ++ torch.compile(fn)(t1, t2) + + @skipIfXpu + def test_linear_mixed_dtype(self): +@@ -3368,6 +3389,8 @@ class CommonTemplate: + msg = "expected .* and .* to have the same dtype, but got: .* != .*" + with self.assertRaisesRegex(RuntimeError, msg): + fn(t) ++ if config.cpp_wrapper: ++ msg = "aoti_torch_.* API call failed at .*" + with self.assertRaisesRegex(RuntimeError, msg): + with torch.no_grad(): + torch.compile(fn)(t) +@@ -5065,6 +5088,7 @@ class CommonTemplate: + if self.device != "cpu": + assertGeneratedKernelCountEqual(self, 1) + ++ @skip_if_cpp_wrapper + def test_complex_fallback(self): + def fn(x): + return x * x + 10 +@@ -5389,6 +5413,7 @@ class CommonTemplate: + ) + + 
@torch._dynamo.config.patch(capture_dynamic_output_shape_ops=True) ++ @skip_if_cpp_wrapper + def test_nonzero_unbacked_refinement(self): + def fn(x): + z = x.nonzero() +@@ -5456,6 +5481,7 @@ class CommonTemplate: + (torch.randn([1, 3, 3, 16]).to(memory_format=torch.channels_last),), + ) + ++ @skip_if_cpp_wrapper + def test_cat_uint8(self): + def fn(x): + batch_shape = x.shape[:1] +@@ -7846,6 +7872,7 @@ class CommonTemplate: + self.assertTrue((d < 1).all()) + + @config.patch(implicit_fallbacks=True) ++ @skip_if_cpp_wrapper + def test_fallback_mutable_op_basic(self): + with torch.library._scoped_library("mylib", "FRAGMENT") as m: + +@@ -7956,6 +7983,7 @@ class CommonTemplate: + self.assertEqual(cloned_args, args) + + @config.patch(implicit_fallbacks=True) ++ @skip_if_cpp_wrapper + def test_fallback_mutable_op_list(self): + with torch.library._scoped_library("mylib", "FRAGMENT") as m: + +@@ -8082,6 +8110,7 @@ class CommonTemplate: + + # Already on by default, just want to make sure + @patch.object(torch._inductor.config, "allow_buffer_reuse", True) ++ @skip_if_cpp_wrapper + def test_reuse_buffers_with_aliasing(self): + def f(x): + z = x + 1 +@@ -8164,6 +8193,7 @@ class CommonTemplate: + self.common(fn, [torch.zeros([20, 20])]) + + @config.patch(check_stack_no_cycles_TESTING_ONLY=True) ++ @skip_if_cpp_wrapper + def test_check_stack_no_cycles(self): + @torch.compile() + def fn(x): +@@ -8579,6 +8609,7 @@ class CommonTemplate: + result = fn(torch.randn([1, 2, 16, 4]).requires_grad_()) + result.sum().backward() + ++ @skip_if_cpp_wrapper + def test_dropout2(self): + n = 100000 + weight = torch.ones( +@@ -8638,6 +8669,7 @@ class CommonTemplate: + self.assertTrue(same(g2, g3)) + + @config.patch(search_autotune_cache=False) ++ @skip_if_cpp_wrapper + def test_dropout3(self): + m = torch.nn.Sequential( + torch.nn.Linear(32, 32, bias=False), +@@ -8664,6 +8696,7 @@ class CommonTemplate: + self.assertEqual(bw_code.count("tl.rand"), 0) + self.assertEqual(torch._inductor.metrics.generated_kernel_count, 4) + ++ @skip_if_cpp_wrapper + def test_randint_kernel_count(self): + @torch._dynamo.optimize_assert("inductor") + def fn1(): +@@ -9302,6 +9335,7 @@ class CommonTemplate: + for x in (torch.randn(2, 3), torch.randn(2, 2), torch.randn(3, 2)): + self.common(fn, (x,)) + ++ @skip_if_cpp_wrapper + def test_kwargs(self): + if self.device == GPU_TYPE: + raise unittest.SkipTest("histogramdd only supports cpu") +@@ -10627,6 +10661,7 @@ class CommonTemplate: + + @requires_gpu() + @config.patch(implicit_fallbacks=True) ++ @skip_if_cpp_wrapper + def test_mutable_custom_op_fixed_layout2(self): + with torch.library._scoped_library("mylib", "DEF") as lib: + mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=GPU_TYPE) +@@ -10680,6 +10715,7 @@ class CommonTemplate: + self.assertNotEqual(bar_strides[0], expected_stride) + + @config.patch(implicit_fallbacks=True) ++ @skip_if_cpp_wrapper + def test_mutable_custom_op_fixed_layout(self): + with torch.library._scoped_library("mylib", "DEF") as lib: + lib.define( +@@ -11007,6 +11043,7 @@ class CommonTemplate: + assertGeneratedKernelCountEqual(self, 1) + + @expectedFailureCodegenDynamic ++ @skip_if_cpp_wrapper + def test_reinterpret_dtypeview(self): + @torch.compile + def fn(x, x2): +@@ -11827,6 +11864,7 @@ if HAS_GPU and not TEST_WITH_ASAN: + self.assertFalse("out_ptr0" in code) + self.assertEqual(fn_opt(*inps), fn(*inps)) + ++ @skip_if_cpp_wrapper + def test_numpy_on_gpu(self): + x = np.arange(10, dtype=np.float32) + +@@ -12210,6 +12248,7 @@ if HAS_GPU and not TEST_WITH_ASAN: 
+
+     @patch("torch._inductor.config.comment_origin", True)
+     @patch("torch._functorch.config.max_dist_from_bw", 0)
++    @skip_if_cpp_wrapper
+     def test_inductor_sequence_nr(self):
+         class Model(torch.nn.Module):
+             def __init__(self) -> None:
+@@ -12356,6 +12395,7 @@ if HAS_GPU and not TEST_WITH_ASAN:
+
+     class NanCheckerTest(TestCase):
+         @config.patch("nan_asserts", True)
++        @skip_if_cpp_wrapper
+         def test_nan_checker_pass(self):
+             def f(x):
+                 return torch.softmax(x, dim=-1)
+@@ -12375,6 +12415,7 @@ if HAS_GPU and not TEST_WITH_ASAN:
+             )
+
+         @config.patch("nan_asserts", True)
++        @skip_if_cpp_wrapper
+         def test_nan_checker_fail(self):
+             def f(x):
+                 return torch.softmax(x, dim=-1)
diff --git a/recipe/patches_submodules/0001-remove-DESTINATION-lib-from-CMake-install-directives.patch b/recipe/patches_submodules/0001-remove-DESTINATION-lib-from-CMake-install-directives.patch
new file mode 100644
index 00000000..665cc74e
--- /dev/null
+++ b/recipe/patches_submodules/0001-remove-DESTINATION-lib-from-CMake-install-directives.patch
@@ -0,0 +1,25 @@
+From a9879bdd5ea793c5301a4b86f163a07e1f28f321 Mon Sep 17 00:00:00 2001
+From: "H. Vetinari" 
+Date: Tue, 28 Jan 2025 13:32:28 +1100
+Subject: [PATCH] remove `DESTINATION lib` from CMake install directives
+
+Suggested-By: Silvio Traversaro 
+---
+ CMakeLists.txt | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/third_party/fbgemm/CMakeLists.txt b/third_party/fbgemm/CMakeLists.txt
+index 134523e7..86fb8fad 100644
+--- a/third_party/fbgemm/CMakeLists.txt
++++ b/third_party/fbgemm/CMakeLists.txt
+@@ -370,8 +370,8 @@ if(MSVC)
+     FILES $<TARGET_PDB_FILE:fbgemm> $<TARGET_PDB_FILE:asmjit>
+     DESTINATION ${CMAKE_INSTALL_LIBDIR} OPTIONAL)
+   endif()
+-  install(TARGETS fbgemm DESTINATION ${CMAKE_INSTALL_LIBDIR})
+-  install(TARGETS asmjit DESTINATION ${CMAKE_INSTALL_LIBDIR})
++  install(TARGETS fbgemm)
++  install(TARGETS asmjit)
+ endif()
+
+ #Make project importable from the build directory
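As an aside on the new recipe/cmake_test/CMakeLists.txt: it only verifies that find_package(Torch CONFIG REQUIRED) and find_package(ATen CONFIG REQUIRED) resolve against the packaged CMake metadata. A fuller consumer-side check would also compile against the located package, following the usage that TorchConfig.cmake itself documents (TORCH_LIBRARIES and TORCH_CXX_FLAGS are set by TorchConfig.cmake). The sketch below is illustrative only; the torch_smoke target and main.cpp are hypothetical names, not part of this recipe:

    cmake_minimum_required(VERSION 3.12)
    project(torch_smoke LANGUAGES CXX)

    # TorchConfig.cmake is located via CMAKE_PREFIX_PATH (passed as $CMAKE_ARGS in the test)
    find_package(Torch CONFIG REQUIRED)

    # TORCH_CXX_FLAGS carries e.g. the ABI define; append it before creating targets
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

    # main.cpp could simply construct a torch::Tensor to prove that linking works
    add_executable(torch_smoke main.cpp)
    target_link_libraries(torch_smoke PRIVATE ${TORCH_LIBRARIES})
    set_property(TARGET torch_smoke PROPERTY CXX_STANDARD 17)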