diff --git a/abs.yaml b/abs.yaml
index c317de07..0756ca38 100644
--- a/abs.yaml
+++ b/abs.yaml
@@ -1,6 +1,3 @@
-build_env_vars:
-  ANACONDA_ROCKET_ENABLE_PY313 : yes
-
 # macOS 12.3 or above is required for running the GPU variant (MPS support). No way to specify this for only the GPU
 # variant, so it's specified for both.
 extra_labels_for_os:
diff --git a/recipe/bld.bat b/recipe/bld.bat
index a076bd7e..f9b1f24f 100644
--- a/recipe/bld.bat
+++ b/recipe/bld.bat
@@ -28,6 +28,8 @@ if "%pytorch_variant%" == "gpu" (
 :: cudatoolkit different than the one specified at compile time.
 :: https://github.com/conda-forge/pytorch-cpu-feedstock/issues/135
 set "USE_KINETO=OFF"
+:: ITT fails on submodules due to a stricter cmake policy version requirement
+set "USE_ITT=0"
 
 :: =============================== CUDA FLAGS> ======================================
 if "%build_with_cuda%" == "" goto cuda_flags_end
@@ -51,13 +53,15 @@ set DISTUTILS_USE_SDK=1
 set BUILD_TEST=0
 set INSTALL_TEST=0
 :: Don't increase MAX_JOBS to NUMBER_OF_PROCESSORS, as it will run out of heap
-set CPU_COUNT=1
+set CPU_COUNT=2
 set MAX_JOBS=%CPU_COUNT%
 
 :: Use our Pybind11, Eigen
 set USE_SYSTEM_PYBIND11=1
 set USE_SYSTEM_EIGEN_INSTALL=1
 set CMAKE_INCLUDE_PATH=%LIBRARY_PREFIX%\include
+set "CMAKE_ARGS=%CMAKE_ARGS% -DCMAKE_POLICY_VERSION_MINIMUM=3.5"
+
 set LIB=%LIBRARY_PREFIX%\lib;%LIB%
 
 :: =============================== CUDA> ======================================
diff --git a/recipe/build.sh b/recipe/build.sh
index c6e8bc61..faad1f7b 100644
--- a/recipe/build.sh
+++ b/recipe/build.sh
@@ -96,6 +96,9 @@ export Python3_EXECUTABLE="${PYTHON}"
 # export CCACHE_BASEDIR=${PREFIX}/../
 # export CCACHE_NOHASHDIR=true
 
+# Tell CMake to treat all old version requirements as 3.5+
+export CMAKE_ARGS="${CMAKE_ARGS} -DCMAKE_POLICY_VERSION_MINIMUM=3.5"
+
 for ARG in $CMAKE_ARGS; do
   if [[ "$ARG" == "-DCMAKE_"* ]]; then
     cmake_arg=$(echo $ARG | cut -d= -f1)
@@ -183,6 +186,8 @@ fi
 
 # MacOS build is simple, and will not be for CUDA
 if [[ "$OSTYPE" == "darwin"* ]]; then
+    # XNNPACK causing issues at build time on osx with libcxx 17
+    export USE_XNNPACK=0
     # Produce macOS builds with torch.distributed support.
     # This is enabled by default on Linux, but disabled by default on macOS,
     # because it requires an non-bundled compile-time dependency (libuv
@@ -190,6 +195,9 @@ if [[ "$OSTYPE" == "darwin"* ]]; then
     # we can override the default and set USE_DISTRIBUTED=1.
     export USE_DISTRIBUTED=1
 
+    # c++ includes are not found in the build prefix by default on osx
+    export CXXFLAGS="$CXXFLAGS -I${BUILD_PREFIX}/include/c++/v1"
+
     if [[ "$target_platform" == "osx-arm64" ]]; then
         # MKLDNN did not support on Apple M1 at the time support Apple M1
         # was added. Revisit later
diff --git a/recipe/conda_build_config.yaml b/recipe/conda_build_config.yaml
index a35919a2..423d091f 100644
--- a/recipe/conda_build_config.yaml
+++ b/recipe/conda_build_config.yaml
@@ -1,21 +1,22 @@
 gpu_variant:
   - cpu
   - metal # [(osx and arm64)]
-  - cuda-12 # [(linux and x86_64)]
+  # - cuda-12 # [(linux and x86_64)]
 c_compiler_version: # [osx]
   - 17 # [osx]
 cxx_compiler_version: # [osx]
   - 17 # [osx]
+
 # CONDA_BUILD_SYSROOT is defined in the base cbc.yaml, but it's reflected here so we can zip the keys and
 # build GPU and CPU at the same time for osx-arm64. It'll need to be manually updated here if the base cbc is changed.
 # This could be done using extend_keys instead, with a change to the base cbc.yaml.
 # However there's currently a conda-forge bug that prevents this: https://github.com/conda/conda-build/issues/5048
 MACOSX_SDK_VERSION: # [(osx and arm64)]
   - 11.1 # [(osx and arm64)]
-  #- 13.3 # [(osx and arm64)]
+  - 13.3 # [(osx and arm64)]
 CONDA_BUILD_SYSROOT: # [(osx and arm64)]
   - /Library/Developer/CommandLineTools/SDKs/MacOSX11.1.sdk # [(osx and arm64)]
-  #- /Library/Developer/CommandLineTools/SDKs/MacOSX13.3.sdk # [(osx and arm64)]
+  - /Library/Developer/CommandLineTools/SDKs/MacOSX13.3.sdk # [(osx and arm64)]
 zip_keys: # [(osx and arm64)]
   - gpu_variant # [(osx and arm64)]
   - MACOSX_SDK_VERSION # [(osx and arm64)]
@@ -27,8 +28,8 @@ zip_keys: # [(osx and arm64)]
 # Conda-forge didn't do a "megabuild" on osx because it pushed their CI runners over their 6-hour limit. We don't have
 # such a limit.
 megabuild:
-- true
-#- false # [osx]
+#- true
+- false
 
 # The version of python to use when building libtorch in a "megabuild"
 megabuild_python:
diff --git a/recipe/meta.yaml b/recipe/meta.yaml
index 4a34d38f..556615f8 100644
--- a/recipe/meta.yaml
+++ b/recipe/meta.yaml
@@ -2,7 +2,7 @@
 {% set sha256 = "3005690eb7b083c443a38c7657938af63902f524ad87a6c83f1aca38c77e3b57" %}
 # Set the RC number to build release candidates. Set to None otherwise
 {% set rc = None %}
-{% set build = 6 %}
+{% set build = 7 %}
 
 # Keep this in sync with the release
 {% set smoke_test_commit = "1eba9b3aa3c43f86f4a2c807ac8e12c4a7767340" %}
@@ -51,9 +51,12 @@ source:
     - patches/0010-make-ATEN_INCLUDE_DIR-relative-to-TORCH_INSTALL_PREF.patch
     - patches/0011-remove-DESTINATION-lib-from-CMake-install-TARGETS-di.patch # [win]
     - patches_submodules/0001-remove-DESTINATION-lib-from-CMake-install-directives.patch # [win]
+    # - patches_submodules/0002-psimd-cmake.patch
+    # - patches_submodules/0003-fp16-cmake.patch
     - patches/0013-simplify-torch.utils.cpp_extension.include_paths-use.patch
     - patches/0014-point-include-paths-to-PREFIX-include.patch
     - patches/0015-point-lib-paths-to-PREFIX-lib.patch
+    - patches/0016-fix-issue-142484.patch # [blas_impl == "mkl" and win]
 {% endif %}
   - url: https://raw.githubusercontent.com/pytorch/pytorch/{{ smoke_test_commit }}/.ci/pytorch/smoke_test/smoke_test.py
     folder: smoke_test
@@ -87,7 +90,7 @@ requirements:
     - python # [build_platform != target_platform]
     - cross-python_{{ target_platform }} # [build_platform != target_platform]
    - numpy * # [megabuild and build_platform != target_platform]
    - numpy # [not megabuild and build_platform != target_platform]
-    #- {{ stdlib('c') }}
+    - {{ stdlib('c') }}
     - {{ compiler('c') }}
     - {{ compiler('cxx') }}
     - {{ compiler('cuda') }} # [(gpu_variant or "").startswith("cuda")]
@@ -114,8 +117,10 @@ requirements:
     # This has a strong run_export so we don't need to put it in `host` or `run`
     # We use llvm-openmp for openblas variants on osx.
     - llvm-openmp 17 # [osx and not (blas_impl == "mkl")]
+    - libcxx 17 # [osx]
     - libuv # [win]
-    - cmake
+    - cmake # [not win]
+    - cmake 3.31.2 # [win]
     - ninja-base
     - libabseil
     # Keep libprotobuf here so that a compatibile version
@@ -159,7 +164,7 @@ requirements:
     - future # [py<313]
     - six
     - mkl-devel {{ mkl }} # [blas_impl == "mkl"]
-    - openblas-devel {{ openblas }}  # [blas_impl == "openblas"]
+    - openblas-devel {{ openblas }} # [blas_impl == "openblas"]
     # - libcblas * *_mkl # [blas_impl == "mkl"]
     # - libcblas # [blas_impl != "mkl"]
     # - liblapack # [blas_impl != "mkl"]
@@ -197,7 +202,8 @@ test:
     # for CMake config to find cuda & nvrtc
     - {{ compiler('cuda') }} # [(gpu_variant or "").startswith("cuda")]
     - cuda-nvrtc-dev # [(gpu_variant or "").startswith("cuda")]
-    - cmake
+    - cmake # [not win]
+    - cmake 3.31.2 # [win]
     - ninja
     - pkg-config
   files:
@@ -211,7 +217,7 @@ test:
    {% for each_lib in ['libc10_cuda', 'libcaffe2_nvrtc', 'libtorch_cuda', 'libtorch_cuda_linalg'] %}
    - test -f $PREFIX/lib/{{ each_lib }}.so # [linux and (gpu_variant or "").startswith("cuda")]
    {% endfor %}
-    # test integrity of CMake metadata 
+    # test integrity of CMake metadata
    - cd cmake_test
    - cmake -GNinja -DCMAKE_CXX_STANDARD=17 $CMAKE_ARGS . # [unix]
    - cmake -GNinja -DCMAKE_CXX_STANDARD=17 %CMAKE_ARGS% . # [win]
@@ -219,6 +225,12 @@ outputs:
   - name: libtorch
     build:
+      overlinking_ignore_patterns: # [linux and aarch64]
+        - lib/libc10.so # [linux and aarch64]
+        - lib/libtorch_cpu.so # [linux and aarch64]
+        - lib/libshm.so # [linux and aarch64]
+        - lib/libtorch.so # [linux and aarch64]
+        - bin/torch_shm_manager # [linux and aarch64]
       missing_dso_whitelist:
         # The are dynamically loaded from %SP_DIR%\torch\lib\
         - "**/asmjit.dll" # [win]
@@ -261,7 +273,7 @@ outputs:
       - python # [build_platform != target_platform]
      - cross-python_{{ target_platform }} # [build_platform != target_platform]
      - numpy # [build_platform != target_platform]
-      #- {{ stdlib('c') }}
+      - {{ stdlib('c') }}
      - {{ compiler('c') }}
      - {{ compiler('cxx') }}
      - {{ compiler('cuda') }} # [(gpu_variant or "").startswith("cuda")]
@@ -288,7 +300,9 @@ outputs:
      # This has a strong run_export so we don't need to put it in `host` or `run`
      # We use llvm-openmp for openblas variants on osx.
      - llvm-openmp 17 # [osx and not (blas_impl == "mkl")]
-      - cmake
+      - libcxx 17 # [osx]
+      - cmake # [not win]
+      - cmake 3.31.2 # [win]
      - ninja-base
      # Keep libprotobuf here so that a compatibile version
      # of protobuf is installed between build and host
@@ -338,6 +352,7 @@ outputs:
      # For openblas on win and linux, we don't specify any openmp implementation; it comes from the compiler.
      - intel-openmp {{ mkl }} # [blas_impl == "mkl"]
      - llvm-openmp 17 # [osx and not (blas_impl == "mkl")]
+      - libcxx 17 # [osx]
      - libabseil
      - libprotobuf {{ libprotobuf }}
      - sleef 3.5.1
@@ -353,6 +368,7 @@ outputs:
    run:
      - {{ pin_compatible('intel-openmp') }} # [blas_impl == "mkl"]
      - llvm-openmp # [osx and not (blas_impl == "mkl")]
+      - libcxx 17 # [osx]
      # GPU requirements without run_exports
      - {{ pin_compatible('cudnn') }} # [(gpu_variant or "").startswith("cuda")]
      # Required for GPU profiler
@@ -410,6 +426,7 @@ outputs:
        - pybind11
        # the inductor "test_aoti_eager..." tests require objcopy
        - binutils # [linux]
+        - libcxx 17 # [osx]
      imports:
        - torch
      source_files:
@@ -452,7 +469,14 @@ outputs:
        # Note that the `|| true` expression will make the build continue even if the whole script falls over completely
        # (for example, in the case of missing imports). There doesn't seem to be a way of making a script exception return
        # non-zero but failing tests return zero.
-        - python ./test/run_test.py --core --continue-through-error || true
+        # ------------------------------------------------------------------------------------------------
+        # Exclude complex tests that are known to be flaky for -k "not (complex and (linalg_vecdot or dot or vdot))"
+        # https://github.com/pytorch/pytorch/issues/150918
+        - python ./test/run_test.py --core --continue-through-error -k "not (complex and (linalg_vecdot or dot or vdot))" || true # [not win]
+        # lgamma or mvlgamma or multigammaln or gammaln all have these issues on a combination of Intel Xeon processors and Windows Server differences.
+        # enabling these tests on windows will cause numerical differences in the test suite.
+        # This is a non-deterministic issue where between 80-110 tests fail. This has been observed between Pytorch 2.5 and above.
+        - python ./test/run_test.py --core --continue-through-error -k "not ((complex and (linalg_vecdot or dot or vdot)) or lgamma or mvlgamma or multigammaln or gammaln)" || exit 0 # [win]
        # The inductor tests test the torch.compile backend. Using the options below avoids running distributed tests,
        # which would be run if we used the --inductor option. (Distributed tests would only be correctly run on a multi-gpu test platform,
        # which we don't have.)
diff --git a/recipe/patches/0016-fix-issue-142484.patch b/recipe/patches/0016-fix-issue-142484.patch
new file mode 100644
index 00000000..2764cb1e
--- /dev/null
+++ b/recipe/patches/0016-fix-issue-142484.patch
@@ -0,0 +1,35 @@
+From 714ead5bf5c7e7ac0f91934232af2e1966b562fb Mon Sep 17 00:00:00 2001
+From: "Zheng, Zhaoqiong"
+Date: Fri, 27 Dec 2024 13:49:36 +0800
+Subject: [PATCH] fix issue 142484
+
+From https://github.com/pytorch/pytorch/pull/143894
+---
+ aten/src/ATen/native/mkl/SpectralOps.cpp | 12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp
+index e26cfbf6d..c61b76d32 100644
+--- a/aten/src/ATen/native/mkl/SpectralOps.cpp
++++ b/aten/src/ATen/native/mkl/SpectralOps.cpp
+@@ -477,7 +477,17 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes,
+ 
+   const auto value_type = c10::toRealValueType(input.scalar_type());
+   out.resize_(batched_out_sizes, MemoryFormat::Contiguous);
+-
++  auto astrides = input.strides();
++  bool all_zero = true;
++  for (const auto& stride : astrides) {
++    if (stride != 0) {
++      all_zero = false;
++      break;
++    }
++  }
++  if (all_zero) {
++    input = input.clone(MemoryFormat::Contiguous);
++  }
+   auto descriptor = _plan_mkl_fft(
+       input.strides(), out.strides(), signal_size, input.is_complex(),
+       out.is_complex(), normalization, forward, value_type);
+-- 
+2.47.1
\ No newline at end of file