diff --git a/abs.yaml b/abs.yaml
index c317de07..0756ca38 100644
--- a/abs.yaml
+++ b/abs.yaml
@@ -1,6 +1,3 @@
-build_env_vars:
-  ANACONDA_ROCKET_ENABLE_PY313 : yes
-
 # macOS 12.3 or above is required for running the GPU variant (MPS support). No way to specify this for only the GPU
 # variant, so it's specified for both.
 extra_labels_for_os:
diff --git a/recipe/bld.bat b/recipe/bld.bat
index a076bd7e..f9b1f24f 100644
--- a/recipe/bld.bat
+++ b/recipe/bld.bat
@@ -28,6 +28,8 @@ if "%pytorch_variant%" == "gpu" (
 :: cudatoolkit different than the one specified at compile time.
 :: https://github.com/conda-forge/pytorch-cpu-feedstock/issues/135
 set "USE_KINETO=OFF"
+:: ITT fails on submodules due to a stricter cmake policy version requirement
+set "USE_ITT=0"
 
 :: =============================== CUDA FLAGS> ======================================
 if "%build_with_cuda%" == "" goto cuda_flags_end
@@ -51,13 +53,15 @@ set DISTUTILS_USE_SDK=1
 set BUILD_TEST=0
 set INSTALL_TEST=0
 :: Don't increase MAX_JOBS to NUMBER_OF_PROCESSORS, as it will run out of heap
-set CPU_COUNT=1
+set CPU_COUNT=2
 set MAX_JOBS=%CPU_COUNT%
 
 :: Use our Pybind11, Eigen
 set USE_SYSTEM_PYBIND11=1
 set USE_SYSTEM_EIGEN_INSTALL=1
 set CMAKE_INCLUDE_PATH=%LIBRARY_PREFIX%\include
+set "CMAKE_ARGS=%CMAKE_ARGS% -DCMAKE_POLICY_VERSION_MINIMUM=3.5"
+
 set LIB=%LIBRARY_PREFIX%\lib;%LIB%
 
 :: =============================== CUDA> ======================================
diff --git a/recipe/build.sh b/recipe/build.sh
index c6e8bc61..faad1f7b 100644
--- a/recipe/build.sh
+++ b/recipe/build.sh
@@ -96,6 +96,9 @@ export Python3_EXECUTABLE="${PYTHON}"
 # export CCACHE_BASEDIR=${PREFIX}/../
 # export CCACHE_NOHASHDIR=true
 
+# Tell CMake to treat all old version requirements as 3.5+
+export CMAKE_ARGS="${CMAKE_ARGS} -DCMAKE_POLICY_VERSION_MINIMUM=3.5"
+
 for ARG in $CMAKE_ARGS; do
   if [[ "$ARG" == "-DCMAKE_"* ]]; then
     cmake_arg=$(echo $ARG | cut -d= -f1)
@@ -183,6 +186,8 @@ fi
 
 # MacOS build is simple, and will not be for CUDA
 if [[ "$OSTYPE" == "darwin"* ]]; then
+    # XNNPACK causing issues at build time on osx with libcxx 17
+    export USE_XNNPACK=0
     # Produce macOS builds with torch.distributed support.
     # This is enabled by default on Linux, but disabled by default on macOS,
     # because it requires an non-bundled compile-time dependency (libuv
@@ -190,6 +195,9 @@ if [[ "$OSTYPE" == "darwin"* ]]; then
     # we can override the default and set USE_DISTRIBUTED=1.
     export USE_DISTRIBUTED=1
 
+    # c++ includes are not found in the build prefix by default on osx
+    export CXXFLAGS="$CXXFLAGS -I${BUILD_PREFIX}/include/c++/v1"
+
     if [[ "$target_platform" == "osx-arm64" ]]; then
         # MKLDNN did not support on Apple M1 at the time support Apple M1
         # was added. Revisit later
diff --git a/recipe/conda_build_config.yaml b/recipe/conda_build_config.yaml
index a35919a2..423d091f 100644
--- a/recipe/conda_build_config.yaml
+++ b/recipe/conda_build_config.yaml
@@ -1,21 +1,22 @@
 gpu_variant:
   - cpu
   - metal # [(osx and arm64)]
-  - cuda-12 # [(linux and x86_64)]
+  # - cuda-12 # [(linux and x86_64)]
 c_compiler_version: # [osx]
   - 17 # [osx]
 cxx_compiler_version: # [osx]
   - 17 # [osx]
+
 # CONDA_BUILD_SYSROOT is defined in the base cbc.yaml, but it's reflected here so we can zip the keys and
 # build GPU and CPU at the same time for osx-arm64. It'll need to be manually updated here if the base cbc is changed.
 # This could be done using extend_keys instead, with a change to the base cbc.yaml.
 # However there's currently a conda-forge bug that prevents this: https://github.com/conda/conda-build/issues/5048
 MACOSX_SDK_VERSION: # [(osx and arm64)]
   - 11.1 # [(osx and arm64)]
-  #- 13.3 # [(osx and arm64)]
+  - 13.3 # [(osx and arm64)]
 CONDA_BUILD_SYSROOT: # [(osx and arm64)]
   - /Library/Developer/CommandLineTools/SDKs/MacOSX11.1.sdk # [(osx and arm64)]
-  #- /Library/Developer/CommandLineTools/SDKs/MacOSX13.3.sdk # [(osx and arm64)]
+  - /Library/Developer/CommandLineTools/SDKs/MacOSX13.3.sdk # [(osx and arm64)]
 zip_keys: # [(osx and arm64)]
   - gpu_variant # [(osx and arm64)]
   - MACOSX_SDK_VERSION # [(osx and arm64)]
@@ -27,8 +28,8 @@ zip_keys: # [(osx and arm64)]
 # Conda-forge didn't do a "megabuild" on osx because it pushed their CI runners over their 6-hour limit. We don't have
 # such a limit.
 megabuild:
-- true
-#- false # [osx]
+#- true
+- false
 
 # The version of python to use when building libtorch in a "megabuild"
 megabuild_python:
diff --git a/recipe/meta.yaml b/recipe/meta.yaml
index 4a34d38f..556615f8 100644
--- a/recipe/meta.yaml
+++ b/recipe/meta.yaml
@@ -2,7 +2,7 @@
 {% set sha256 = "3005690eb7b083c443a38c7657938af63902f524ad87a6c83f1aca38c77e3b57" %}
 # Set the RC number to build release candidates. Set to None otherwise
 {% set rc = None %}
-{% set build = 6 %}
+{% set build = 7 %}
 
 # Keep this in sync with the release
 {% set smoke_test_commit = "1eba9b3aa3c43f86f4a2c807ac8e12c4a7767340" %}
@@ -51,9 +51,12 @@ source:
     - patches/0010-make-ATEN_INCLUDE_DIR-relative-to-TORCH_INSTALL_PREF.patch
     - patches/0011-remove-DESTINATION-lib-from-CMake-install-TARGETS-di.patch # [win]
     - patches_submodules/0001-remove-DESTINATION-lib-from-CMake-install-directives.patch # [win]
+    # - patches_submodules/0002-psimd-cmake.patch
+    # - patches_submodules/0003-fp16-cmake.patch
     - patches/0013-simplify-torch.utils.cpp_extension.include_paths-use.patch
     - patches/0014-point-include-paths-to-PREFIX-include.patch
     - patches/0015-point-lib-paths-to-PREFIX-lib.patch
+    - patches/0016-fix-issue-142484.patch # [blas_impl == "mkl" and win]
 {% endif %}
   - url: https://raw.githubusercontent.com/pytorch/pytorch/{{ smoke_test_commit }}/.ci/pytorch/smoke_test/smoke_test.py
     folder: smoke_test
@@ -87,7 +90,7 @@ requirements:
     - python # [build_platform != target_platform]
     - cross-python_{{ target_platform }} # [build_platform != target_platform]
    - numpy * # [megabuild and build_platform != target_platform]
    - numpy # [not megabuild and build_platform != target_platform]
-    #- {{ stdlib('c') }}
+    - {{ stdlib('c') }}
     - {{ compiler('c') }}
     - {{ compiler('cxx') }}
     - {{ compiler('cuda') }} # [(gpu_variant or "").startswith("cuda")]
@@ -114,8 +117,10 @@ requirements:
     # This has a strong run_export so we don't need to put it in `host` or `run`
     # We use llvm-openmp for openblas variants on osx.
     - llvm-openmp 17 # [osx and not (blas_impl == "mkl")]
+    - libcxx 17 # [osx]
     - libuv # [win]
-    - cmake
+    - cmake # [not win]
+    - cmake 3.31.2 # [win]
     - ninja-base
     - libabseil
     # Keep libprotobuf here so that a compatibile version
@@ -159,7 +164,7 @@ requirements:
     - future # [py<313]
     - six
     - mkl-devel {{ mkl }} # [blas_impl == "mkl"]
-    - openblas-devel {{ openblas }}  # [blas_impl == "openblas"]
+    - openblas-devel {{ openblas }} # [blas_impl == "openblas"]
     # - libcblas * *_mkl # [blas_impl == "mkl"]
     # - libcblas # [blas_impl != "mkl"]
     # - liblapack # [blas_impl != "mkl"]
@@ -197,7 +202,8 @@ test:
     # for CMake config to find cuda & nvrtc
     - {{ compiler('cuda') }} # [(gpu_variant or "").startswith("cuda")]
     - cuda-nvrtc-dev # [(gpu_variant or "").startswith("cuda")]
-    - cmake
+    - cmake # [not win]
+    - cmake 3.31.2 # [win]
     - ninja
     - pkg-config
   files:
@@ -211,7 +217,7 @@ test:
    {% for each_lib in ['libc10_cuda', 'libcaffe2_nvrtc', 'libtorch_cuda', 'libtorch_cuda_linalg'] %}
    - test -f $PREFIX/lib/{{ each_lib }}.so # [linux and (gpu_variant or "").startswith("cuda")]
    {% endfor %}
-    # test integrity of CMake metadata 
+    # test integrity of CMake metadata
    - cd cmake_test
    - cmake -GNinja -DCMAKE_CXX_STANDARD=17 $CMAKE_ARGS . # [unix]
    - cmake -GNinja -DCMAKE_CXX_STANDARD=17 %CMAKE_ARGS% . # [win]
@@ -219,6 +225,12 @@ outputs:
   - name: libtorch
     build:
+      overlinking_ignore_patterns: # [linux and aarch64]
+        - lib/libc10.so # [linux and aarch64]
+        - lib/libtorch_cpu.so # [linux and aarch64]
+        - lib/libshm.so # [linux and aarch64]
+        - lib/libtorch.so # [linux and aarch64]
+        - bin/torch_shm_manager # [linux and aarch64]
       missing_dso_whitelist:
         # The are dynamically loaded from %SP_DIR%\torch\lib\
         - "**/asmjit.dll" # [win]
@@ -261,7 +273,7 @@ outputs:
       - python # [build_platform != target_platform]
      - cross-python_{{ target_platform }} # [build_platform != target_platform]
      - numpy # [build_platform != target_platform]
-      #- {{ stdlib('c') }}
+      - {{ stdlib('c') }}
      - {{ compiler('c') }}
      - {{ compiler('cxx') }}
      - {{ compiler('cuda') }} # [(gpu_variant or "").startswith("cuda")]
@@ -288,7 +300,9 @@ outputs:
      # This has a strong run_export so we don't need to put it in `host` or `run`
      # We use llvm-openmp for openblas variants on osx.
      - llvm-openmp 17 # [osx and not (blas_impl == "mkl")]
-      - cmake
+      - libcxx 17 # [osx]
+      - cmake # [not win]
+      - cmake 3.31.2 # [win]
      - ninja-base
      # Keep libprotobuf here so that a compatibile version
      # of protobuf is installed between build and host
@@ -338,6 +352,7 @@ outputs:
      # For openblas on win and linux, we don't specify any openmp implementation; it comes from the compiler.
      - intel-openmp {{ mkl }} # [blas_impl == "mkl"]
      - llvm-openmp 17 # [osx and not (blas_impl == "mkl")]
+      - libcxx 17 # [osx]
      - libabseil
      - libprotobuf {{ libprotobuf }}
      - sleef 3.5.1
@@ -353,6 +368,7 @@ outputs:
    run:
      - {{ pin_compatible('intel-openmp') }} # [blas_impl == "mkl"]
      - llvm-openmp # [osx and not (blas_impl == "mkl")]
+      - libcxx 17 # [osx]
      # GPU requirements without run_exports
      - {{ pin_compatible('cudnn') }} # [(gpu_variant or "").startswith("cuda")]
      # Required for GPU profiler
@@ -410,6 +426,7 @@ outputs:
        - pybind11
        # the inductor "test_aoti_eager..." tests require objcopy
        - binutils # [linux]
+        - libcxx 17 # [osx]
      imports:
        - torch
      source_files:
@@ -452,7 +469,14 @@ outputs:
        # Note that the `|| true` expression will make the build continue even if the whole script falls over completely
        # (for example, in the case of missing imports). There doesn't seem to be a way of making a script exception return
        # non-zero but failing tests return zero.
-        - python ./test/run_test.py --core --continue-through-error || true
+        # ------------------------------------------------------------------------------------------------
+        # Exclude complex tests that are known to be flaky for -k "not (complex and (linalg_vecdot or dot or vdot))"
+        # https://github.com/pytorch/pytorch/issues/150918
+        - python ./test/run_test.py --core --continue-through-error -k "not (complex and (linalg_vecdot or dot or vdot))" || true # [not win]
+        # lgamma or mvlgamma or multigammaln or gammaln all have these issues on a combination of Intel Xeon processors and Windows Server differences.
+        # enabling these tests on windows will cause numerical differences in the test suite.
+        # This is a non-deterministic issue where between 80-110 tests fail. This has been observed between Pytorch 2.5 and above.
+        - python ./test/run_test.py --core --continue-through-error -k "not ((complex and (linalg_vecdot or dot or vdot)) or lgamma or mvlgamma or multigammaln or gammaln)" || exit 0 # [win]
        # The inductor tests test the torch.compile backend. Using the options below avoids running distributed tests,
        # which would be run if we used the --inductor option. (Distributed tests would only be correctly run on a multi-gpu test platform,
        # which we don't have.)
diff --git a/recipe/patches/0016-fix-issue-142484.patch b/recipe/patches/0016-fix-issue-142484.patch
new file mode 100644
index 00000000..2764cb1e
--- /dev/null
+++ b/recipe/patches/0016-fix-issue-142484.patch
@@ -0,0 +1,35 @@
+From 714ead5bf5c7e7ac0f91934232af2e1966b562fb Mon Sep 17 00:00:00 2001
+From: "Zheng, Zhaoqiong"
+Date: Fri, 27 Dec 2024 13:49:36 +0800
+Subject: [PATCH] fix issue 142484
+
+From https://github.com/pytorch/pytorch/pull/143894
+---
+ aten/src/ATen/native/mkl/SpectralOps.cpp | 12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp
+index e26cfbf6d..c61b76d32 100644
+--- a/aten/src/ATen/native/mkl/SpectralOps.cpp
++++ b/aten/src/ATen/native/mkl/SpectralOps.cpp
+@@ -477,7 +477,17 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes,
+ 
+   const auto value_type = c10::toRealValueType(input.scalar_type());
+   out.resize_(batched_out_sizes, MemoryFormat::Contiguous);
+-
++  auto astrides = input.strides();
++  bool all_zero = true;
++  for (const auto& stride : astrides) {
++    if (stride != 0) {
++      all_zero = false;
++      break;
++    }
++  }
++  if (all_zero) {
++    input = input.clone(MemoryFormat::Contiguous);
++  }
+   auto descriptor = _plan_mkl_fft(
+       input.strides(), out.strides(), signal_size, input.is_complex(),
+       out.is_complex(), normalization, forward, value_type);
+-- 
+2.47.1
\ No newline at end of file