diff --git a/.bc-linter.yml b/.bc-linter.yml new file mode 100644 index 0000000000000..cafa3a51c3ac1 --- /dev/null +++ b/.bc-linter.yml @@ -0,0 +1,15 @@ +version: 1 +paths: +include: + - "**/*.py" +exclude: + - ".*" + - ".*/**" + - "**/.*/**" + - "**/.*" + - "**/_*/**" + - "**/_*.py" + - "**/test/**" + - "**/benchmarks/**" + - "**/test_*.py" + - "**/*_test.py" diff --git a/.ci/aarch64_linux/aarch64_ci_build.sh b/.ci/aarch64_linux/aarch64_ci_build.sh index 424ddd0013cd8..bf8bab6dde232 100644 --- a/.ci/aarch64_linux/aarch64_ci_build.sh +++ b/.ci/aarch64_linux/aarch64_ci_build.sh @@ -3,8 +3,18 @@ set -eux -o pipefail GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-} -if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then - export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0" +# Set CUDA architecture lists to match x86 build_cuda.sh +if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then + export TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;8.0;9.0" +elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then + export TORCH_CUDA_ARCH_LIST="7.0;8.0;9.0;10.0;12.0" +elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then + export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX" +fi + +# Compress the fatbin with -compress-mode=size for CUDA 13 +if [[ "$DESIRED_CUDA" == *"13"* ]]; then + export TORCH_NVCC_FLAGS="-compress-mode=size" fi SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" @@ -18,7 +28,7 @@ cd / # on the mounted pytorch repo git config --global --add safe.directory /pytorch pip install -r /pytorch/requirements.txt -pip install auditwheel==6.2.0 +pip install auditwheel==6.2.0 wheel if [ "$DESIRED_CUDA" = "cpu" ]; then echo "BASE_CUDA_VERSION is not set. Building cpu wheel." #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files @@ -26,6 +36,19 @@ if [ "$DESIRED_CUDA" = "cpu" ]; then else echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" export USE_SYSTEM_NCCL=1 + + # Check if we should use NVIDIA libs from PyPI (similar to x86 build_cuda.sh logic) + if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then + echo "Bundling CUDA libraries with wheel for aarch64." + else + echo "Using nvidia libs from pypi for aarch64." 
+ # Fix platform constraints in PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64 + # Replace 'platform_machine == "x86_64"' with 'platform_machine == "aarch64"' + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS//platform_machine == \'x86_64\'/platform_machine == \'aarch64\'}" + echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS" + export USE_NVIDIA_PYPI_LIBS=1 + fi + #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda fi diff --git a/.ci/aarch64_linux/aarch64_wheel_ci_build.py b/.ci/aarch64_linux/aarch64_wheel_ci_build.py index d7bbdebc677ab..4bb9c64ea7772 100755 --- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py +++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py @@ -69,61 +69,181 @@ def replace_tag(filename) -> None: f.writelines(lines) +def patch_library_rpath( + folder: str, + lib_name: str, + use_nvidia_pypi_libs: bool = False, + desired_cuda: str = "", +) -> None: + """Apply patchelf to set RPATH for a library in torch/lib""" + lib_path = f"{folder}/tmp/torch/lib/{lib_name}" + + if use_nvidia_pypi_libs: + # For PyPI NVIDIA libraries, construct CUDA RPATH + cuda_rpaths = [ + "$ORIGIN/../../nvidia/cudnn/lib", + "$ORIGIN/../../nvidia/nvshmem/lib", + "$ORIGIN/../../nvidia/nccl/lib", + "$ORIGIN/../../nvidia/cusparselt/lib", + ] + + if "130" in desired_cuda: + cuda_rpaths.append("$ORIGIN/../../nvidia/cu13/lib") + else: + cuda_rpaths.extend( + [ + "$ORIGIN/../../nvidia/cublas/lib", + "$ORIGIN/../../nvidia/cuda_cupti/lib", + "$ORIGIN/../../nvidia/cuda_nvrtc/lib", + "$ORIGIN/../../nvidia/cuda_runtime/lib", + "$ORIGIN/../../nvidia/cufft/lib", + "$ORIGIN/../../nvidia/curand/lib", + "$ORIGIN/../../nvidia/cusolver/lib", + "$ORIGIN/../../nvidia/cusparse/lib", + "$ORIGIN/../../nvidia/nvtx/lib", + "$ORIGIN/../../nvidia/cufile/lib", + ] + ) + + # Add $ORIGIN for local torch libs + rpath = ":".join(cuda_rpaths) + ":$ORIGIN" + else: + # For bundled libraries, just use $ORIGIN + rpath = "$ORIGIN" + + if os.path.exists(lib_path): + os.system( + f"cd {folder}/tmp/torch/lib/; " + f"patchelf --set-rpath '{rpath}' --force-rpath {lib_name}" + ) + + +def copy_and_patch_library( + src_path: str, + folder: str, + use_nvidia_pypi_libs: bool = False, + desired_cuda: str = "", +) -> None: + """Copy a library to torch/lib and patch its RPATH""" + if os.path.exists(src_path): + lib_name = os.path.basename(src_path) + shutil.copy2(src_path, f"{folder}/tmp/torch/lib/{lib_name}") + patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda) + + def package_cuda_wheel(wheel_path, desired_cuda) -> None: """ Package the cuda wheel libraries """ folder = os.path.dirname(wheel_path) - wheelname = os.path.basename(wheel_path) os.mkdir(f"{folder}/tmp") os.system(f"unzip {wheel_path} -d {folder}/tmp") - libs_to_copy = [ - "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12", - "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so", - "/usr/local/cuda/lib64/libcudnn.so.9", - "/usr/local/cuda/lib64/libcublas.so.12", - "/usr/local/cuda/lib64/libcublasLt.so.12", - "/usr/local/cuda/lib64/libcudart.so.12", - "/usr/local/cuda/lib64/libcufft.so.11", - "/usr/local/cuda/lib64/libcusparse.so.12", - "/usr/local/cuda/lib64/libcusparseLt.so.0", - "/usr/local/cuda/lib64/libcusolver.so.11", - "/usr/local/cuda/lib64/libcurand.so.10", - "/usr/local/cuda/lib64/libnccl.so.2", - 
"/usr/local/cuda/lib64/libnvJitLink.so.12", - "/usr/local/cuda/lib64/libnvrtc.so.12", - "/usr/local/cuda/lib64/libcudnn_adv.so.9", - "/usr/local/cuda/lib64/libcudnn_cnn.so.9", - "/usr/local/cuda/lib64/libcudnn_graph.so.9", - "/usr/local/cuda/lib64/libcudnn_ops.so.9", - "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9", - "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9", - "/usr/local/cuda/lib64/libcudnn_heuristic.so.9", - "/lib64/libgomp.so.1", - "/usr/lib64/libgfortran.so.5", - "/acl/build/libarm_compute.so", - "/acl/build/libarm_compute_graph.so", - "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0", - "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0", - "/usr/local/lib/libnvpl_lapack_core.so.0", - "/usr/local/lib/libnvpl_blas_core.so.0", - ] - if "129" in desired_cuda: - libs_to_copy += [ - "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9", + # Check if we should use PyPI NVIDIA libraries or bundle system libraries + use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1" + + if use_nvidia_pypi_libs: + print("Using nvidia libs from pypi - skipping CUDA library bundling") + # For PyPI approach, we don't bundle CUDA libraries - they come from PyPI packages + # We only need to bundle non-NVIDIA libraries + minimal_libs_to_copy = [ + "/lib64/libgomp.so.1", + "/usr/lib64/libgfortran.so.5", + "/acl/build/libarm_compute.so", + "/acl/build/libarm_compute_graph.so", + "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_lapack_core.so.0", + "/usr/local/lib/libnvpl_blas_core.so.0", + ] + + # Copy minimal libraries to unzipped_folder/torch/lib + for lib_path in minimal_libs_to_copy: + copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda) + + # Patch torch libraries used for searching libraries + torch_libs_to_patch = [ + "libtorch.so", + "libtorch_cpu.so", + "libtorch_cuda.so", + "libtorch_cuda_linalg.so", + "libtorch_global_deps.so", + "libtorch_python.so", + "libtorch_nvshmem.so", + "libc10.so", + "libc10_cuda.so", + "libcaffe2_nvrtc.so", + "libshm.so", + ] + for lib_name in torch_libs_to_patch: + patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda) + else: + print("Bundling CUDA libraries with wheel") + # Original logic for bundling system CUDA libraries + # Common libraries for all CUDA versions + common_libs = [ + # Non-NVIDIA system libraries + "/lib64/libgomp.so.1", + "/usr/lib64/libgfortran.so.5", + "/acl/build/libarm_compute.so", + "/acl/build/libarm_compute_graph.so", + # Common CUDA libraries (same for all versions) + "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_lapack_core.so.0", + "/usr/local/lib/libnvpl_blas_core.so.0", + "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so", + "/usr/local/cuda/lib64/libcudnn.so.9", + "/usr/local/cuda/lib64/libcusparseLt.so.0", + "/usr/local/cuda/lib64/libcurand.so.10", + "/usr/local/cuda/lib64/libnccl.so.2", + "/usr/local/cuda/lib64/libnvshmem_host.so.3", + "/usr/local/cuda/lib64/libcudnn_adv.so.9", + "/usr/local/cuda/lib64/libcudnn_cnn.so.9", + "/usr/local/cuda/lib64/libcudnn_graph.so.9", + "/usr/local/cuda/lib64/libcudnn_ops.so.9", + "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9", + "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9", + "/usr/local/cuda/lib64/libcudnn_heuristic.so.9", "/usr/local/cuda/lib64/libcufile.so.0", "/usr/local/cuda/lib64/libcufile_rdma.so.1", + "/usr/local/cuda/lib64/libcusparse.so.12", 
] - # Copy libraries to unzipped_folder/a/lib - for lib_path in libs_to_copy: - lib_name = os.path.basename(lib_path) - shutil.copy2(lib_path, f"{folder}/tmp/torch/lib/{lib_name}") - os.system( - f"cd {folder}/tmp/torch/lib/; " - f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}" - ) + # CUDA version-specific libraries + if "130" in desired_cuda: + version_specific_libs = [ + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13", + "/usr/local/cuda/lib64/libcublas.so.13", + "/usr/local/cuda/lib64/libcublasLt.so.13", + "/usr/local/cuda/lib64/libcudart.so.13", + "/usr/local/cuda/lib64/libcufft.so.12", + "/usr/local/cuda/lib64/libcusolver.so.12", + "/usr/local/cuda/lib64/libnvJitLink.so.13", + "/usr/local/cuda/lib64/libnvrtc.so.13", + "/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0", + ] + elif "12" in desired_cuda: + # Get the last character for libnvrtc-builtins version (e.g., "129" -> "9") + minor_version = desired_cuda[-1] + version_specific_libs = [ + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12", + "/usr/local/cuda/lib64/libcublas.so.12", + "/usr/local/cuda/lib64/libcublasLt.so.12", + "/usr/local/cuda/lib64/libcudart.so.12", + "/usr/local/cuda/lib64/libcufft.so.11", + "/usr/local/cuda/lib64/libcusolver.so.11", + "/usr/local/cuda/lib64/libnvJitLink.so.12", + "/usr/local/cuda/lib64/libnvrtc.so.12", + f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}", + ] + + # Combine all libraries + libs_to_copy = common_libs + version_specific_libs + + # Copy libraries to unzipped_folder/torch/lib + for lib_path in libs_to_copy: + copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda) # Make sure the wheel is tagged with manylinux_2_28 for f in os.scandir(f"{folder}/tmp/"): @@ -131,14 +251,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: replace_tag(f"{f.path}/WHEEL") break - os.mkdir(f"{folder}/cuda_wheel") - os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *") - shutil.move( - f"{folder}/cuda_wheel/{wheelname}", - f"{folder}/{wheelname}", - copy_function=shutil.copy2, - ) - os.system(f"rm -rf {folder}/tmp/ {folder}/cuda_wheel/") + os.system(f"wheel pack {folder}/tmp/ -d {folder}") + os.system(f"rm -rf {folder}/tmp/") def complete_wheel(folder: str) -> str: @@ -208,7 +322,17 @@ def parse_arguments(): build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " # MAX_JOB=5 is not required for CPU backend (see commit 465d98b) if enable_cuda: - build_vars = "MAX_JOBS=5 " + build_vars + build_vars += "MAX_JOBS=5 " + + # Handle PyPI NVIDIA libraries vs bundled libraries + use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1" + if use_nvidia_pypi_libs: + print("Configuring build for PyPI NVIDIA libraries") + # Configure for dynamic linking (matching x86 logic) + build_vars += "ATEN_STATIC_CUDA=0 USE_CUDA_STATIC_LINK=0 USE_CUPTI_SO=1 " + else: + print("Configuring build for bundled NVIDIA libraries") + # Keep existing static linking approach - already configured above override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") desired_cuda = os.getenv("DESIRED_CUDA") diff --git a/.ci/aarch64_linux/build_aarch64_wheel.py b/.ci/aarch64_linux/build_aarch64_wheel.py index 025d0a20579d4..7a4715d330060 100755 --- a/.ci/aarch64_linux/build_aarch64_wheel.py +++ b/.ci/aarch64_linux/build_aarch64_wheel.py @@ -438,9 +438,7 @@ def build_torchvision( ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: - build_vars += ( - 
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}" - ) + build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" @@ -495,9 +493,7 @@ def build_torchdata( ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: - build_vars += ( - f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}" - ) + build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" @@ -553,9 +549,7 @@ def build_torchtext( ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: - build_vars += ( - f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}" - ) + build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" @@ -613,9 +607,7 @@ def build_torchaudio( ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: - build_vars += ( - f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}" - ) + build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" diff --git a/.ci/docker/README.md b/.ci/docker/README.md index 26c97754faa70..5a97a0a3c2d46 100644 --- a/.ci/docker/README.md +++ b/.ci/docker/README.md @@ -120,8 +120,8 @@ If your new Docker image needs a library installed from a specific pinned commit If you're introducing a new argument to the Docker build, make sure to add it in the Docker build step in `.ci/docker/build.sh`: ```bash docker build \ - .... - --build-arg "NEW_ARG_1=${NEW_ARG_1}" + .... + --build-arg "NEW_ARG_1=${NEW_ARG_1}" ``` 3. 
**Update Dockerfile logic**: diff --git a/.ci/docker/almalinux/Dockerfile b/.ci/docker/almalinux/Dockerfile index 418a76ceac234..481d21b96cfe9 100644 --- a/.ci/docker/almalinux/Dockerfile +++ b/.ci/docker/almalinux/Dockerfile @@ -64,6 +64,10 @@ FROM cuda as cuda12.9 RUN bash ./install_cuda.sh 12.9 ENV DESIRED_CUDA=12.9 +FROM cuda as cuda13.0 +RUN bash ./install_cuda.sh 13.0 +ENV DESIRED_CUDA=13.0 + FROM ${ROCM_IMAGE} as rocm ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" ADD ./common/install_mkl.sh install_mkl.sh @@ -76,10 +80,10 @@ ADD ./common/install_mnist.sh install_mnist.sh RUN bash ./install_mnist.sh FROM base as all_cuda -COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8 COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6 COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8 COPY --from=cuda12.9 /usr/local/cuda-12.9 /usr/local/cuda-12.9 +COPY --from=cuda13.0 /usr/local/cuda-13.0 /usr/local/cuda-13.0 # Final step FROM ${BASE_TARGET} as final diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 689d6f43b8e98..48be0cf538054 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -76,10 +76,13 @@ elif [[ "$image" == *cuda*linter* ]]; then elif [[ "$image" == *linter* ]]; then # Use a separate Dockerfile for linter to keep a small image size DOCKERFILE="linter/Dockerfile" +elif [[ "$image" == *riscv* ]]; then + # Use RISC-V specific Dockerfile + DOCKERFILE="ubuntu-cross-riscv/Dockerfile" fi -_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb -_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b +_UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152 +_UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96 if [[ "$image" == *rocm* ]]; then _UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6 _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d @@ -111,41 +114,18 @@ case "$tag" in UCC_COMMIT=${_UCC_COMMIT} TRITON=yes ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks) - CUDA_VERSION=12.8.1 + pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11) + CUDA_VERSION=13.0.0 ANACONDA_PYTHON_VERSION=3.10 - GCC_VERSION=9 - VISION=yes - KATEX=yes - UCX_COMMIT=${_UCX_COMMIT} - UCC_COMMIT=${_UCC_COMMIT} - TRITON=yes - INDUCTOR_BENCHMARKS=yes - ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks) - CUDA_VERSION=12.8.1 - ANACONDA_PYTHON_VERSION=3.12 - GCC_VERSION=9 + GCC_VERSION=11 VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} TRITON=yes - INDUCTOR_BENCHMARKS=yes ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks) + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks) CUDA_VERSION=12.8.1 - ANACONDA_PYTHON_VERSION=3.13 - GCC_VERSION=9 - VISION=yes - KATEX=yes - UCX_COMMIT=${_UCX_COMMIT} - UCC_COMMIT=${_UCC_COMMIT} - TRITON=yes - INDUCTOR_BENCHMARKS=yes - ;; - pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9) - CUDA_VERSION=12.6.3 ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=9 VISION=yes @@ -153,6 +133,7 @@ case "$tag" in UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} TRITON=yes + INDUCTOR_BENCHMARKS=yes ;; pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm) CUDA_VERSION=12.8.1 @@ -164,39 +145,6 @@ case "$tag" in UCC_COMMIT=${_UCC_COMMIT} TRITON=yes ;; - pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks) - CUDA_VERSION=12.6 - ANACONDA_PYTHON_VERSION=3.10 - GCC_VERSION=9 - VISION=yes - KATEX=yes - UCX_COMMIT=${_UCX_COMMIT} - UCC_COMMIT=${_UCC_COMMIT} - TRITON=yes - INDUCTOR_BENCHMARKS=yes - ;; - 
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks) - CUDA_VERSION=12.6 - ANACONDA_PYTHON_VERSION=3.12 - GCC_VERSION=9 - VISION=yes - KATEX=yes - UCX_COMMIT=${_UCX_COMMIT} - UCC_COMMIT=${_UCC_COMMIT} - TRITON=yes - INDUCTOR_BENCHMARKS=yes - ;; - pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks) - CUDA_VERSION=12.6 - ANACONDA_PYTHON_VERSION=3.13 - GCC_VERSION=9 - VISION=yes - KATEX=yes - UCX_COMMIT=${_UCX_COMMIT} - UCC_COMMIT=${_UCC_COMMIT} - TRITON=yes - INDUCTOR_BENCHMARKS=yes - ;; pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9) CUDA_VERSION=12.8.1 ANACONDA_PYTHON_VERSION=3.10 @@ -208,30 +156,18 @@ case "$tag" in TRITON=yes ;; pytorch-linux-jammy-py3-clang12-onnx) - ANACONDA_PYTHON_VERSION=3.9 + ANACONDA_PYTHON_VERSION=3.10 CLANG_VERSION=12 VISION=yes ONNX=yes ;; - pytorch-linux-jammy-py3.9-clang12) - ANACONDA_PYTHON_VERSION=3.9 - CLANG_VERSION=12 - VISION=yes - TRITON=yes - ;; - pytorch-linux-jammy-py3.11-clang12) - ANACONDA_PYTHON_VERSION=3.11 + pytorch-linux-jammy-py3.10-clang12) + ANACONDA_PYTHON_VERSION=3.10 CLANG_VERSION=12 VISION=yes TRITON=yes ;; - pytorch-linux-jammy-py3.9-gcc9) - ANACONDA_PYTHON_VERSION=3.9 - GCC_VERSION=9 - VISION=yes - TRITON=yes - ;; - pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3) + pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3) if [[ $tag =~ "jammy" ]]; then ANACONDA_PYTHON_VERSION=3.10 else @@ -245,7 +181,9 @@ case "$tag" in KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} - INDUCTOR_BENCHMARKS=yes + if [[ $tag =~ "benchmarks" ]]; then + INDUCTOR_BENCHMARKS=yes + fi ;; pytorch-linux-noble-rocm-alpha-py3) ANACONDA_PYTHON_VERSION=3.12 @@ -257,26 +195,26 @@ case "$tag" in KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} - INDUCTOR_BENCHMARKS=yes PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950" ;; - pytorch-linux-jammy-xpu-2025.0-py3) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-xpu-n-1-py3) + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes - XPU_VERSION=2025.0 + XPU_VERSION=2025.1 NINJA_VERSION=1.9.0 TRITON=yes ;; - pytorch-linux-jammy-xpu-2025.1-py3) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-xpu-n-py3) + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes - XPU_VERSION=2025.1 + XPU_VERSION=2025.2 NINJA_VERSION=1.9.0 TRITON=yes ;; - pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks) + pytorch-linux-jammy-py3-gcc11-inductor-benchmarks) + # TODO (huydhn): Upgrade this to Python >= 3.10 ANACONDA_PYTHON_VERSION=3.9 GCC_VERSION=11 VISION=yes @@ -285,8 +223,8 @@ case "$tag" in DOCS=yes INDUCTOR_BENCHMARKS=yes ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12) + ANACONDA_PYTHON_VERSION=3.10 CUDA_VERSION=12.8.1 CLANG_VERSION=12 VISION=yes @@ -297,8 +235,8 @@ case "$tag" in CLANG_VERSION=18 VISION=yes ;; - pytorch-linux-jammy-py3.9-gcc11) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-py3.10-gcc11) + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes KATEX=yes @@ -339,7 +277,6 @@ case "$tag" in GCC_VERSION=11 ACL=yes VISION=yes - CONDA_CMAKE=yes OPENBLAS=yes # snadampal: skipping llvm src build install because the current version # from pytorch/llvm:9.0.1 is x86 specific @@ -350,13 +287,15 @@ case "$tag" in GCC_VERSION=11 ACL=yes VISION=yes - CONDA_CMAKE=yes OPENBLAS=yes # snadampal: skipping llvm src build install because the current version # from pytorch/llvm:9.0.1 is x86 specific 
SKIP_LLVM_SRC_BUILD_INSTALL=yes INDUCTOR_BENCHMARKS=yes ;; + pytorch-linux-noble-riscv64-py3.12-gcc14) + GCC_VERSION=14 + ;; *) # Catch-all for builds that are not hardcoded. VISION=yes @@ -481,7 +420,14 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then fi if [ -n "$GCC_VERSION" ]; then - if !(drun gcc --version 2>&1 | grep -q " $GCC_VERSION\\W"); then + if [[ "$image" == *riscv* ]]; then + # Check RISC-V cross-compilation toolchain version + if !(drun riscv64-linux-gnu-gcc-${GCC_VERSION} --version 2>&1 | grep -q " $GCC_VERSION\\W"); then + echo "RISC-V GCC_VERSION=$GCC_VERSION, but:" + drun riscv64-linux-gnu-gcc-${GCC_VERSION} --version + exit 1 + fi + elif !(drun gcc --version 2>&1 | grep -q " $GCC_VERSION\\W"); then echo "GCC_VERSION=$GCC_VERSION, but:" drun gcc --version exit 1 diff --git a/.ci/docker/ci_commit_pins/huggingface-requirements.txt b/.ci/docker/ci_commit_pins/huggingface-requirements.txt new file mode 100644 index 0000000000000..66e5dbdfb1bb1 --- /dev/null +++ b/.ci/docker/ci_commit_pins/huggingface-requirements.txt @@ -0,0 +1,2 @@ +transformers==4.54.0 +soxr==0.5.0 diff --git a/.ci/docker/ci_commit_pins/huggingface.txt b/.ci/docker/ci_commit_pins/huggingface.txt deleted file mode 100644 index f00d6ca4f9ca7..0000000000000 --- a/.ci/docker/ci_commit_pins/huggingface.txt +++ /dev/null @@ -1 +0,0 @@ -243e186efbf7fb93328dd6b34927a4e8c8f24395 diff --git a/.ci/docker/ci_commit_pins/nccl-cu13.txt b/.ci/docker/ci_commit_pins/nccl-cu13.txt new file mode 100644 index 0000000000000..77202c1566019 --- /dev/null +++ b/.ci/docker/ci_commit_pins/nccl-cu13.txt @@ -0,0 +1 @@ +v2.27.7-1 diff --git a/.ci/docker/ci_commit_pins/torchbench.txt b/.ci/docker/ci_commit_pins/torchbench.txt new file mode 100644 index 0000000000000..c9be7b440baea --- /dev/null +++ b/.ci/docker/ci_commit_pins/torchbench.txt @@ -0,0 +1 @@ +74a23feff57432129df84d8099e622773cf77925 diff --git a/.ci/docker/ci_commit_pins/triton-xpu.txt b/.ci/docker/ci_commit_pins/triton-xpu.txt index 80d7d7ed18af9..b03606f6defc1 100644 --- a/.ci/docker/ci_commit_pins/triton-xpu.txt +++ b/.ci/docker/ci_commit_pins/triton-xpu.txt @@ -1 +1 @@ -ae324eeac8e102a2b40370e341460f3791353398 +1b0418a9a454b2b93ab8d71f40e59d2297157fae diff --git a/.ci/docker/common/install_cpython.sh b/.ci/docker/common/install_cpython.sh index d7fc6ea264ddb..692edd0b898f1 100755 --- a/.ci/docker/common/install_cpython.sh +++ b/.ci/docker/common/install_cpython.sh @@ -66,8 +66,9 @@ function do_cpython_build { ln -s pip3 ${prefix}/bin/pip fi # install setuptools since python 3.12 is required to use distutils - ${prefix}/bin/pip install wheel==0.45.1 setuptools==80.9.0 - local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))") + # packaging is needed to create symlink since wheel no longer provides needed information + ${prefix}/bin/pip install packaging==25.0 wheel==0.45.1 setuptools==80.9.0 + local abi_tag=$(${prefix}/bin/python -c "from packaging.tags import interpreter_name, interpreter_version; import sysconfig ; from sysconfig import get_config_var; print('{0}{1}-{0}{1}{2}'.format(interpreter_name(), interpreter_version(), 't' if sysconfig.get_config_var('Py_GIL_DISABLED') else ''))") ln -sf ${prefix} /opt/python/${abi_tag} } @@ -82,9 +83,9 @@ function build_cpython { py_suffix=${py_ver::-1} py_folder=$py_suffix fi - # Only b3 is available now + # Update to rc2 due to https://github.com/python/cpython/commit/c72699086fe4 if [ "$py_suffix" 
== "3.14.0" ]; then - py_suffix="3.14.0b3" + py_suffix="3.14.0rc2" fi wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz do_cpython_build $py_ver Python-$py_suffix diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh index c8a780f65c8e5..c6808ea4a7a26 100644 --- a/.ci/docker/common/install_cuda.sh +++ b/.ci/docker/common/install_cuda.sh @@ -10,7 +10,7 @@ else arch_path='sbsa' fi -NVSHMEM_VERSION=3.3.9 +NVSHMEM_VERSION=3.3.24 function install_cuda { version=$1 @@ -62,14 +62,16 @@ function install_nvshmem { mkdir -p "${tmpdir}" && cd "${tmpdir}" # nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html - filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}" - url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz" + # This pattern is a lie as it is not consistent across versions, for 3.3.9 it was cuda_ver-arch-nvshhem-ver + filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive" + suffix=".tar.xz" + url="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/linux-${arch_path}/${filename}${suffix}" # download, unpack, install wget -q "${url}" - tar xf "${filename}.tar.gz" - cp -a "libnvshmem/include/"* /usr/local/include/ - cp -a "libnvshmem/lib/"* /usr/local/lib/ + tar xf "${filename}${suffix}" + cp -a "${filename}/include/"* /usr/local/cuda/include/ + cp -a "${filename}/lib/"* /usr/local/cuda/lib64/ # cleanup cd .. @@ -126,74 +128,6 @@ function install_129 { ldconfig } -function prune_124 { - echo "Pruning CUDA 12.4" - ##################################################################################### - # CUDA 12.4 prune static libs - ##################################################################################### - export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune" - export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64" - - export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" - export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" - - if [[ -n "$OVERRIDE_GENCODE" ]]; then - export GENCODE=$OVERRIDE_GENCODE - fi - if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then - export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN - fi - - # all CUDA libs except CuDNN and CuBLAS - ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \ - | xargs -I {} bash -c \ - "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}" - - # prune CuDNN and CuBLAS - $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a - $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a - - ##################################################################################### - # CUDA 12.4 prune visual tools - ##################################################################################### - export CUDA_BASE="/usr/local/cuda-12.4/" - rm -rf $CUDA_BASE/libnvvp 
$CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/ -} - -function prune_126 { - echo "Pruning CUDA 12.6" - ##################################################################################### - # CUDA 12.6 prune static libs - ##################################################################################### - export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune" - export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64" - - export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" - export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" - - if [[ -n "$OVERRIDE_GENCODE" ]]; then - export GENCODE=$OVERRIDE_GENCODE - fi - if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then - export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN - fi - - # all CUDA libs except CuDNN and CuBLAS - ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \ - | xargs -I {} bash -c \ - "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}" - - # prune CuDNN and CuBLAS - $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a - $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a - - ##################################################################################### - # CUDA 12.6 prune visual tools - ##################################################################################### - export CUDA_BASE="/usr/local/cuda-12.6/" - rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/ -} - function install_128 { CUDNN_VERSION=9.8.0.87 echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" @@ -212,18 +146,38 @@ function install_128 { ldconfig } +function install_130 { + CUDNN_VERSION=9.13.0.50 + echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" + # install CUDA 13.0 in the same container + install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux + + # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement + install_cudnn 13 $CUDNN_VERSION + + install_nvshmem 13 $NVSHMEM_VERSION + + CUDA_VERSION=13.0 bash install_nccl.sh + + CUDA_VERSION=13.0 bash install_cusparselt.sh + + ldconfig +} + # idiomatic parameter and option handling in sh while test $# -gt 0 do case "$1" in - 12.4) install_124; prune_124 + 12.4) install_124; ;; - 12.6|12.6.*) install_126; prune_126 + 12.6|12.6.*) install_126; ;; 12.8|12.8.*) install_128; ;; 12.9|12.9.*) install_129; ;; + 13.0|13.0.*) install_130; + ;; *) echo "bad argument $1"; exit 1 ;; esac diff --git a/.ci/docker/common/install_cusparselt.sh b/.ci/docker/common/install_cusparselt.sh index feacb49f39eb5..b532c086371f1 100644 --- a/.ci/docker/common/install_cusparselt.sh +++ b/.ci/docker/common/install_cusparselt.sh @@ -5,7 +5,15 @@ set -ex # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html mkdir tmp_cusparselt && cd tmp_cusparselt -if [[ ${CUDA_VERSION:0:4} =~ 
^12\.[5-9]$ ]]; then +if [[ ${CUDA_VERSION:0:4} =~ "13" ]]; then + arch_path='sbsa' + export TARGETARCH=${TARGETARCH:-$(uname -m)} + if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then + arch_path='x86_64' + fi + CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.8.0.4_cuda13-archive" + curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz +elif [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then arch_path='sbsa' export TARGETARCH=${TARGETARCH:-$(uname -m)} if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then diff --git a/.ci/docker/common/install_inductor_benchmark_deps.sh b/.ci/docker/common/install_inductor_benchmark_deps.sh index 7312dce170db2..81467d87f5140 100644 --- a/.ci/docker/common/install_inductor_benchmark_deps.sh +++ b/.ci/docker/common/install_inductor_benchmark_deps.sh @@ -5,9 +5,7 @@ set -ex source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" function install_huggingface() { - local version - commit=$(get_pinned_commit huggingface) - pip_install "git+https://github.com/huggingface/transformers@${commit}" + pip_install -r huggingface-requirements.txt } function install_timm() { @@ -15,11 +13,34 @@ function install_timm() { commit=$(get_pinned_commit timm) pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}" - # Clean up - conda_run pip uninstall -y torch torchvision triton +} + +function install_torchbench() { + local commit + commit=$(get_pinned_commit torchbench) + git clone https://github.com/pytorch/benchmark torchbench + pushd torchbench + git checkout "$commit" + + python install.py --continue_on_fail + + echo "Print all dependencies after TorchBench is installed" + python -mpip freeze + popd + + chown -R jenkins torchbench + chown -R jenkins /opt/conda } # Pango is needed for weasyprint which is needed for doctr conda_install pango + +# Stable packages are ok here, just to satisfy TorchBench check +pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 + +install_torchbench install_huggingface install_timm + +# Clean up +conda_run pip uninstall -y torch torchvision torchaudio triton torchao diff --git a/.ci/docker/common/install_nccl.sh b/.ci/docker/common/install_nccl.sh index 17d80ebe7d273..58a8e0b4e49c1 100644 --- a/.ci/docker/common/install_nccl.sh +++ b/.ci/docker/common/install_nccl.sh @@ -7,6 +7,8 @@ if [[ ${CUDA_VERSION:0:2} == "11" ]]; then NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt) elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt) +elif [[ ${CUDA_VERSION:0:2} == "13" ]]; then + NCCL_VERSION=$(cat ci_commit_pins/nccl-cu13.txt) else echo "Unexpected CUDA_VERSION ${CUDA_VERSION}" exit 1 diff --git a/.ci/docker/common/install_onnx.sh b/.ci/docker/common/install_onnx.sh index d07ec32001635..9f23feb5adfaf 100755 --- a/.ci/docker/common/install_onnx.sh +++ b/.ci/docker/common/install_onnx.sh @@ -19,8 +19,8 @@ pip_install \ transformers==4.36.2 pip_install coloredlogs packaging -pip_install onnxruntime==1.18.1 -pip_install onnxscript==0.3.1 +pip_install onnxruntime==1.22.1 +pip_install onnxscript==0.4.0 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers # package to download the model. 
By default, the model is cached at ~/.cache/huggingface/hub/ diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index 726dfd1c74cfa..8e714bcb6cd32 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -57,7 +57,7 @@ if [ ! -f setup.py ]; then cd python fi -pip_install pybind11==2.13.6 +pip_install pybind11==3.0.1 # TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527 as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py diff --git a/.ci/docker/common/install_ucc.sh b/.ci/docker/common/install_ucc.sh index b7f884ea9648f..04f15a52e88e3 100755 --- a/.ci/docker/common/install_ucc.sh +++ b/.ci/docker/common/install_ucc.sh @@ -44,8 +44,12 @@ function install_ucc() { ./autogen.sh - # We only run distributed tests on Tesla M60 and A10G - NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86" + if [[ -n "$CUDA_VERSION" && $CUDA_VERSION == 13* ]]; then + NVCC_GENCODE="-gencode=arch=compute_86,code=compute_86" + else + # We only run distributed tests on Tesla M60 and A10G + NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86" + fi if [[ -n "$ROCM_VERSION" ]]; then if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then diff --git a/.ci/docker/common/install_xpu.sh b/.ci/docker/common/install_xpu.sh index ecbbb8ccccf89..0b150872f93ce 100644 --- a/.ci/docker/common/install_xpu.sh +++ b/.ci/docker/common/install_xpu.sh @@ -34,18 +34,27 @@ function install_ubuntu() { # The xpu-smi packages apt-get install -y flex bison xpu-smi - # Compute and Media Runtimes - apt-get install -y \ - intel-opencl-icd intel-level-zero-gpu level-zero \ - intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \ - libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ - libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \ - mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo - if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then - apt-get install -y intel-ocloc + + if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then + # Compute and Media Runtimes + apt-get install -y \ + intel-opencl-icd intel-level-zero-gpu level-zero \ + intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \ + libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ + libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \ + mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo + # Development Packages + apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev + else # rolling driver + apt-get install -y \ + intel-opencl-icd libze-intel-gpu1 libze1 \ + intel-media-va-driver-non-free libmfx-gen1 libvpl2 \ + libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ + libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \ + mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc + apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev fi - # Development Packages - apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev + # Install Intel Support Packages apt-get install -y ${XPU_PACKAGES} @@ -56,10 +65,14 @@ function install_ubuntu() { function 
install_rhel() { . /etc/os-release - - if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then - echo "RHEL version ${VERSION_ID} not supported" - exit + if [[ "${ID}" == "rhel" ]]; then + if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then + echo "RHEL version ${VERSION_ID} not supported" + exit + fi + elif [[ "${ID}" == "almalinux" ]]; then + # Workaround for almalinux8 which used by quay.io/pypa/manylinux_2_28_x86_64 + VERSION_ID="8.8" fi dnf install -y 'dnf-command(config-manager)' @@ -130,18 +143,18 @@ function install_sles() { } -# Default use GPU driver LTS releases -XPU_DRIVER_VERSION="/lts/2350" -if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then - # Use GPU driver rolling releases - XPU_DRIVER_VERSION="" +# Default use GPU driver rolling releases +XPU_DRIVER_VERSION="" +if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then + # Use GPU driver LTS releases + XPU_DRIVER_VERSION="/lts/2350" fi -# Default use Intel® oneAPI Deep Learning Essentials 2025.0 -if [[ "$XPU_VERSION" == "2025.1" ]]; then - XPU_PACKAGES="intel-deep-learning-essentials-2025.1" +# Default use Intel® oneAPI Deep Learning Essentials 2025.1 +if [[ "$XPU_VERSION" == "2025.2" ]]; then + XPU_PACKAGES="intel-deep-learning-essentials-2025.2" else - XPU_PACKAGES="intel-deep-learning-essentials-2025.0" + XPU_PACKAGES="intel-deep-learning-essentials-2025.1" fi # The installation depends on the base OS diff --git a/.ci/docker/libtorch/Dockerfile b/.ci/docker/libtorch/Dockerfile index a99a39d776267..c93f022268b25 100644 --- a/.ci/docker/libtorch/Dockerfile +++ b/.ci/docker/libtorch/Dockerfile @@ -69,6 +69,19 @@ RUN bash ./install_cuda.sh 12.9 RUN bash ./install_magma.sh 12.9 RUN ln -sf /usr/local/cuda-12.9 /usr/local/cuda +FROM cuda as cuda13.0 +RUN bash ./install_cuda.sh 13.0 +RUN bash ./install_magma.sh 13.0 +RUN ln -sf /usr/local/cuda-13.0 /usr/local/cuda + +# Install libibverbs for libtorch and copy to CUDA directory +RUN apt-get update -y && \ + apt-get install -y libibverbs-dev librdmacm-dev && \ + cp /usr/lib/x86_64-linux-gnu/libmlx5.so* /usr/local/cuda/lib64/ && \ + cp /usr/lib/x86_64-linux-gnu/librdmacm.so* /usr/local/cuda/lib64/ && \ + cp /usr/lib/x86_64-linux-gnu/libibverbs.so* /usr/local/cuda/lib64/ && \ + cp /usr/lib/x86_64-linux-gnu/libnl* /usr/local/cuda/lib64/ + FROM cpu as rocm ARG PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} diff --git a/.ci/docker/manywheel/Dockerfile_2_28 b/.ci/docker/manywheel/Dockerfile_2_28 index baee261d6ff65..4c5347b40629c 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28 +++ b/.ci/docker/manywheel/Dockerfile_2_28 @@ -175,6 +175,6 @@ ENV XPU_DRIVER_TYPE ROLLING RUN python3 -m pip install --upgrade pip && \ python3 -mpip install cmake==3.28.4 ADD ./common/install_xpu.sh install_xpu.sh -ENV XPU_VERSION 2025.1 +ENV XPU_VERSION 2025.2 RUN bash ./install_xpu.sh && rm install_xpu.sh RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh index abe47bbe9188c..5dee4325857fb 100755 --- a/.ci/docker/manywheel/build.sh +++ b/.ci/docker/manywheel/build.sh @@ -67,6 +67,12 @@ case ${image} in DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13" MANY_LINUX_VERSION="2_28" ;; + manylinux2_28-builder:cuda13*) + TARGET=cuda_final + GPU_IMAGE=amd64/almalinux:8 + DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13" + MANY_LINUX_VERSION="2_28" + ;; 
manylinuxaarch64-builder:cuda*) TARGET=cuda_final GPU_IMAGE=amd64/almalinux:8 diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 12c2f5678c5a5..45fef66fd567f 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -63,11 +63,12 @@ lark==0.12.0 #Pinned versions: 0.12.0 #test that import: -librosa>=0.6.2 ; python_version < "3.11" -librosa==0.10.2 ; python_version == "3.12" +librosa>=0.6.2 ; python_version < "3.11" and platform_machine != "s390x" +librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x" #Description: A python package for music and audio analysis #Pinned versions: >=0.6.2 #test that import: test_spectral_ops.py +#librosa depends on numba; disable it for s390x while numba is disabled too #mkl #this breaks linux-bionic-rocm4.5-py3.7 #Description: Intel oneAPI Math Kernel Library @@ -116,6 +117,7 @@ numba==0.61.2 ; python_version > "3.9" #Pinned versions: 0.54.1, 0.49.0, <=0.49.1 #test that import: test_numba_integration.py #For numba issue see https://github.com/pytorch/pytorch/issues/51511 +#Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073 #numpy #Description: Provides N-dimensional arrays and linear algebra @@ -299,7 +301,7 @@ pytest-cpp==2.3.0 #Pinned versions: 2.3.0 #test that import: -z3-solver==4.15.1.0 +z3-solver==4.15.1.0 ; platform_machine != "s390x" #Description: The Z3 Theorem Prover Project #Pinned versions: #test that import: @@ -335,7 +337,7 @@ onnx==1.18.0 #Pinned versions: #test that import: -onnxscript==0.3.1 +onnxscript==0.4.0 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal #Pinned versions: #test that import: @@ -354,7 +356,6 @@ pwlf==2.2.1 #Pinned versions: 2.2.1 #test that import: test_sac_estimator.py - # To build PyTorch itself pyyaml pyzstd @@ -376,7 +377,7 @@ dataclasses_json==0.6.7 cmake==4.0.0 #Description: required for building -tlparse==0.3.30 +tlparse==0.4.0 #Description: required for log parsing cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x" diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index 3de4d8e0e44ec..efe6fb4c949b0 100644 --- a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -1,7 +1,7 @@ sphinx==5.3.0 #Description: This is used to generate PyTorch docs #Pinned versions: 5.3.0 --e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2 +-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering # but it doesn't seem to work and hangs around idly. 
The initial thought that it is probably diff --git a/.ci/docker/triton_xpu_version.txt b/.ci/docker/triton_xpu_version.txt index 18091983f59dd..1545d966571dc 100644 --- a/.ci/docker/triton_xpu_version.txt +++ b/.ci/docker/triton_xpu_version.txt @@ -1 +1 @@ -3.4.0 +3.5.0 diff --git a/.ci/docker/ubuntu-cross-riscv/Dockerfile b/.ci/docker/ubuntu-cross-riscv/Dockerfile new file mode 100644 index 0000000000000..08201dc83216c --- /dev/null +++ b/.ci/docker/ubuntu-cross-riscv/Dockerfile @@ -0,0 +1,155 @@ +# Cross-compilation Docker container for RISC-V architecture +ARG UBUNTU_VERSION +FROM --platform=linux/amd64 ubuntu:${UBUNTU_VERSION} as base + +ARG UBUNTU_VERSION + +ENV GCC_VERSION=14 +ENV PYTHON_VERSION=3.12.3 +ENV DEBIAN_FRONTEND=noninteractive +ENV CC=riscv64-linux-gnu-gcc-${GCC_VERSION} +ENV CXX=riscv64-linux-gnu-g++-${GCC_VERSION} +ENV QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ +ENV SYSROOT=/opt/sysroot + +# Install basic dependencies +RUN apt-get update && apt-get install -y \ + ninja-build \ + autoconf \ + automake \ + libtool \ + patchelf \ + ccache \ + git \ + wget \ + python3-pip \ + python3-venv \ + python-is-python3 \ + cmake \ + sudo \ + lsb-release \ + gcc-${GCC_VERSION}-riscv64-linux-gnu \ + g++-${GCC_VERSION}-riscv64-linux-gnu \ + pkg-config \ + && rm -rf /var/lib/apt/lists/* + +# Install user +COPY ./common/install_user.sh install_user.sh +RUN bash ./install_user.sh && rm install_user.sh + +FROM base as python +ARG ZLIB_VERSION=1.3.1 +ARG FFI_VERSION=3.4.6 +ARG BZ2_VERSION=1.0.8 +ARG XZ_VERSION=5.4.6 +ARG OPENSSL_VERSION=3.2.1 + +# Set up sysroot directory for dependencies +ENV PKG_CONFIG_PATH=${SYSROOT}/lib/pkgconfig +ENV PKG_CONFIG_SYSROOT_DIR=${SYSROOT} + +WORKDIR /opt + +# Build zlib (for compression) +RUN echo "--- Building zlib ---" \ + && wget -c https://www.zlib.net/zlib-${ZLIB_VERSION}.tar.gz \ + && tar -xf zlib-${ZLIB_VERSION}.tar.gz --no-same-permissions --no-same-owner \ + && cd zlib-${ZLIB_VERSION}/ \ + && mkdir build && cd build \ + && ../configure --prefix=${SYSROOT} \ + && make -j$(nproc) && make install \ + && cd ../.. + +# Build libffi (for ctypes module) +RUN echo "--- Building libffi ---" \ + && wget -c https://github.com/libffi/libffi/releases/download/v${FFI_VERSION}/libffi-${FFI_VERSION}.tar.gz \ + && tar -xf libffi-${FFI_VERSION}.tar.gz --no-same-permissions --no-same-owner \ + && cd libffi-${FFI_VERSION}/ \ + && mkdir build && cd build \ + && ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \ + && make -j$(nproc) && make install \ + && cd ../.. 
+ +# Build bzip2 (for bz2 module) +RUN echo "--- Building bzip2 ---" \ + && wget -c https://sourceware.org/pub/bzip2/bzip2-${BZ2_VERSION}.tar.gz \ + && tar -xf bzip2-${BZ2_VERSION}.tar.gz --no-same-permissions --no-same-owner \ + && cd bzip2-${BZ2_VERSION}/ \ + && make CC=riscv64-linux-gnu-gcc-${GCC_VERSION} bzip2 bzip2recover libbz2.a \ + && make CC=riscv64-linux-gnu-gcc-${GCC_VERSION} -f Makefile-libbz2_so \ + && make install PREFIX=${SYSROOT} \ + && cp libbz2.so.${BZ2_VERSION} ${SYSROOT}/lib/ \ + && cd ${SYSROOT}/lib/ \ + && ln -sf libbz2.so.${BZ2_VERSION} libbz2.so.1.0 \ + && ln -sf libbz2.so.1.0 libbz2.so \ + && cd /opt/ + +# Build xz (for lzma module) +RUN echo "--- Building xz ---" \ + && wget -c https://github.com/tukaani-project/xz/releases/download/v${XZ_VERSION}/xz-${XZ_VERSION}.tar.gz \ + && tar -xf xz-${XZ_VERSION}.tar.gz --no-same-permissions --no-same-owner \ + && cd xz-${XZ_VERSION} \ + && mkdir build && cd build \ + && ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \ + && make -j$(nproc) && make install \ + && cd ../.. + +# Build OpenSSL (for ssl module) +RUN echo "--- Building OpenSSL ---" \ + && wget -c https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz \ + && tar -xf openssl-${OPENSSL_VERSION}.tar.gz --no-same-permissions --no-same-owner \ + && cd openssl-${OPENSSL_VERSION}/ \ + && mkdir build && cd build \ + && ../Configure linux64-riscv64 --prefix=${SYSROOT} \ + && make -j$(nproc) && make install_sw \ + && cd ../.. + +# Build SQLite3 (for sqlite3 module) +RUN echo "--- Building SQLite3 ---" \ + && wget -c https://www.sqlite.org/2024/sqlite-autoconf-3450200.tar.gz \ + && tar -xf sqlite-autoconf-3450200.tar.gz --no-same-permissions --no-same-owner \ + && cd sqlite-autoconf-3450200 \ + && mkdir build && cd build \ + && ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \ + && make -j$(nproc) && make install \ + && cd ../.. 
+ +# Build and install RISC-V Python with all modules +RUN wget -c https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \ + && tar -xf Python-${PYTHON_VERSION}.tgz --no-same-permissions --no-same-owner \ + && cd Python-${PYTHON_VERSION} \ + && mkdir build && cd build \ + && ../configure \ + --host=riscv64-linux-gnu \ + --build=x86_64-linux-gnu \ + --prefix=${SYSROOT} \ + --enable-shared \ + --disable-ipv6 \ + --with-build-python=/usr/bin/python3 \ + --with-ensurepip=no \ + ac_cv_file__dev_ptmx=yes \ + ac_cv_file__dev_ptc=no \ + && make -j$(nproc) \ + && make install + +FROM base as final +COPY --from=python /opt/sysroot /opt/sysroot + +# Install crossenv and cmake +RUN pip install crossenv cmake==4.0.0 --break-system-packages \ + && /usr/bin/python3 -m crossenv ${SYSROOT}/bin/python3 /opt/riscv-cross-env + +# Add pip-installed cmake binaries to PATH +ENV PATH="/usr/local/bin:${PATH}" + +# Set up cross Python environment +SHELL ["/bin/bash", "-c"] +RUN source /opt/riscv-cross-env/bin/activate \ + && pip install setuptools pyyaml typing_extensions wheel + +# Set default environment variables for PyTorch build +ENV Python_ROOT_DIR=${SYSROOT} +ENV OPENSSL_ROOT_DIR=${SYSROOT} + +USER jenkins +CMD ["bash"] diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile index 883248b884ed8..e5b672cc8e37f 100644 --- a/.ci/docker/ubuntu-rocm/Dockerfile +++ b/.ci/docker/ubuntu-rocm/Dockerfile @@ -96,10 +96,11 @@ ARG ANACONDA_PYTHON_VERSION ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh COPY ./common/common_utils.sh common_utils.sh -COPY ci_commit_pins/huggingface.txt huggingface.txt +COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt COPY ci_commit_pins/timm.txt timm.txt +COPY ci_commit_pins/torchbench.txt torchbench.txt RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi -RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt +RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt # (optional) Install non-default Ninja version ARG NINJA_VERSION diff --git a/.ci/docker/ubuntu-xpu/Dockerfile b/.ci/docker/ubuntu-xpu/Dockerfile index a0e7dce3df4d5..8765249688ce5 100644 --- a/.ci/docker/ubuntu-xpu/Dockerfile +++ b/.ci/docker/ubuntu-xpu/Dockerfile @@ -56,10 +56,10 @@ RUN rm install_openssl.sh ARG INDUCTOR_BENCHMARKS COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh COPY ./common/common_utils.sh common_utils.sh -COPY ci_commit_pins/huggingface.txt huggingface.txt +COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt COPY ci_commit_pins/timm.txt timm.txt RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi -RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt +RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt # Install XPU Dependencies ARG XPU_VERSION diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 27c466dd8d41d..1edc8c60c2f07 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -66,6 +66,7 @@ ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/" # (optional) Install UCC ARG UCX_COMMIT ARG UCC_COMMIT +ARG CUDA_VERSION ENV UCX_COMMIT $UCX_COMMIT ENV UCC_COMMIT $UCC_COMMIT ENV UCX_HOME /usr @@ -96,10 +97,11 @@ 
RUN rm install_openssl.sh
ARG INDUCTOR_BENCHMARKS
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/huggingface.txt huggingface.txt
+COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/timm.txt timm.txt
+COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt

ARG TRITON
ARG TRITON_CPU
@@ -180,7 +182,6 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
RUN if [ -n "${SKIP_LLVM_SRC_BUILD_INSTALL}" ]; then set -eu; rm -rf /opt/llvm; fi

# AWS specific CUDA build guidance
-ENV TORCH_CUDA_ARCH_LIST Maxwell
ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all"
ENV CUDA_PATH /usr/local/cuda
diff --git a/.ci/libtorch/build.sh b/.ci/libtorch/build.sh
index e822feb2674d9..54ddd905aad05 100644
--- a/.ci/libtorch/build.sh
+++ b/.ci/libtorch/build.sh
@@ -7,4 +7,4 @@ set -ex

SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

-USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
+USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
diff --git a/.ci/lumen_cli/README.md b/.ci/lumen_cli/README.md
new file mode 100644
index 0000000000000..a0bb8b19a000f
--- /dev/null
+++ b/.ci/lumen_cli/README.md
@@ -0,0 +1,31 @@
+# 🔧 Lumen_cli
+A Python CLI tool for building and testing PyTorch-based components, using a YAML configuration file for structured, repeatable workflows.
+
+
+## Features
+- **Build**
+  - external projects (e.g. vLLM)
+
+## 📦 Installation
+At the root of the PyTorch repo:
+```bash
+pip install -e .ci/lumen_cli
+```
+
+## Run the CLI tool
+The CLI tool must be run from the root of the PyTorch repo. For example, to build external vLLM:
+```bash
+python -m cli.run build external vllm
+```
+This runs the build steps with the default behaviour for the vLLM project.
+
+To see the help messages, run:
+```bash
+python3 -m cli.run --help
+```
+
+## Add customized external build logic
+To add a new external build target:
+1. Create the build function in the `cli/lib` folder.
+2. Register your target and the main build function in `EXTERNAL_BUILD_TARGET_DISPATCH` in `cli/build_cli/register_build.py`.
+3. 
[optional] create your ci config file in .github/ci_configs/${EXTERNAL_PACKAGE_NAME}.yaml diff --git a/test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_blocked b/.ci/lumen_cli/cli/build_cli/__init__.py similarity index 100% rename from test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_blocked rename to .ci/lumen_cli/cli/build_cli/__init__.py diff --git a/.ci/lumen_cli/cli/build_cli/register_build.py b/.ci/lumen_cli/cli/build_cli/register_build.py new file mode 100644 index 0000000000000..9f35a9c8165dc --- /dev/null +++ b/.ci/lumen_cli/cli/build_cli/register_build.py @@ -0,0 +1,37 @@ +import argparse +import logging + +from cli.lib.common.cli_helper import register_targets, RichHelp, TargetSpec +from cli.lib.core.vllm.vllm_build import VllmBuildRunner + + +logger = logging.getLogger(__name__) + +# Maps targets to their argparse configuration and runner +# it adds new target to path python -m cli.run build external {target} with buildrunner +_TARGETS: dict[str, TargetSpec] = { + "vllm": { + "runner": VllmBuildRunner, + "help": "Build vLLM using docker buildx.", + } + # add yours ... +} + + +def register_build_commands(subparsers: argparse._SubParsersAction) -> None: + build_parser = subparsers.add_parser( + "build", + help="Build related commands", + formatter_class=RichHelp, + ) + build_subparsers = build_parser.add_subparsers(dest="build_command", required=True) + overview = "\n".join( + f" {name:12} {spec.get('help', '')}" for name, spec in _TARGETS.items() + ) + external_parser = build_subparsers.add_parser( + "external", + help="Build external targets", + description="Build third-party targets.\n\nAvailable targets:\n" + overview, + formatter_class=RichHelp, + ) + register_targets(external_parser, _TARGETS) diff --git a/test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_bool_called_at_least_once b/.ci/lumen_cli/cli/lib/__init__.py similarity index 100% rename from test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_bool_called_at_least_once rename to .ci/lumen_cli/cli/lib/__init__.py diff --git a/.ci/lumen_cli/cli/lib/common/cli_helper.py b/.ci/lumen_cli/cli/lib/common/cli_helper.py new file mode 100644 index 0000000000000..927ca09fe7230 --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/cli_helper.py @@ -0,0 +1,71 @@ +""" +Cli Argparser Utility helpers for CLI tasks. 
+ +""" + +import argparse +from abc import ABC, abstractmethod + + +try: + from typing import Any, Callable, Required, TypedDict # Python 3.11+ +except ImportError: + from typing import Any, Callable, TypedDict + + from typing_extensions import Required # Fallback for Python <3.11 + + +class BaseRunner(ABC): + def __init__(self, args: Any) -> None: + self.args = args + + @abstractmethod + def run(self) -> None: + """runs main logics, required""" + + +# Pretty help: keep newlines + show defaults +class RichHelp( + argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter +): + pass + + +class TargetSpec(TypedDict, total=False): + """CLI subcommand specification with bA.""" + + runner: Required[type[BaseRunner]] + help: str + description: str + add_arguments: Callable[[argparse.ArgumentParser], None] + + +def register_targets( + parser: argparse.ArgumentParser, + target_specs: dict[str, TargetSpec], + common_args: Callable[[argparse.ArgumentParser], None] = lambda _: None, +) -> None: + """Register target subcommands.""" + targets = parser.add_subparsers( + dest="target", + required=True, + metavar="{" + ",".join(target_specs.keys()) + "}", + ) + + for name, spec in target_specs.items(): + desc = spec.get("description") or spec["runner"].__doc__ or "" + + p = targets.add_parser( + name, + help=spec.get("help", ""), + description=desc.strip(), + formatter_class=RichHelp, + ) + p.set_defaults( + func=lambda args, cls=spec["runner"]: cls(args).run(), + _runner_class=spec["runner"], + ) + if "add_arguments" in spec and callable(spec["add_arguments"]): + spec["add_arguments"](p) + if common_args: + common_args(p) diff --git a/.ci/lumen_cli/cli/lib/common/docker_helper.py b/.ci/lumen_cli/cli/lib/common/docker_helper.py new file mode 100644 index 0000000000000..b5f0a90e2d47a --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/docker_helper.py @@ -0,0 +1,42 @@ +""" +Docker Utility helpers for CLI tasks. +""" + +import logging +from typing import Optional + +import docker +from docker.errors import APIError, NotFound + + +logger = logging.getLogger(__name__) + +# lazy singleton so we don't reconnect every call +_docker_client: Optional[docker.DockerClient] = None + + +def _get_client() -> docker.DockerClient: + global _docker_client + if _docker_client is None: + _docker_client = docker.from_env() + return _docker_client + + +def local_image_exists( + image_name: str, client: Optional[docker.DockerClient] = None +) -> bool: + """Return True if a local Docker image exists.""" + if not image_name: + return False + + client = client or _get_client() + try: + client.images.get(image_name) + return True + except (NotFound, APIError) as e: + logger.error( + "Error when checking Docker image '%s': %s", + image_name, + e.explanation if hasattr(e, "explanation") else str(e), + ) + return False diff --git a/.ci/lumen_cli/cli/lib/common/envs_helper.py b/.ci/lumen_cli/cli/lib/common/envs_helper.py new file mode 100644 index 0000000000000..a654e7f18ed9f --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/envs_helper.py @@ -0,0 +1,110 @@ +""" +Environment Variables and Dataclasses Utility helpers for CLI tasks. 
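To make the dispatch in `register_targets` above concrete, here is a minimal, self-contained sketch of how a runner gets wired into an argparse parser and then invoked through the `func` default. `HelloRunner`, `add_hello_args`, and the `hello` target are illustrative only:

```python
# Illustrative sketch; HelloRunner and the "hello" target are not part of this PR.
import argparse

from cli.lib.common.cli_helper import BaseRunner, register_targets, RichHelp, TargetSpec


class HelloRunner(BaseRunner):
    """Say hello."""

    def run(self) -> None:
        print(f"hello, {self.args.name}")


def add_hello_args(p: argparse.ArgumentParser) -> None:
    p.add_argument("--name", default="world", help="who to greet")


parser = argparse.ArgumentParser(prog="demo", formatter_class=RichHelp)
specs: dict[str, TargetSpec] = {
    "hello": {"runner": HelloRunner, "add_arguments": add_hello_args},
}
register_targets(parser, specs)

ns = parser.parse_args(["hello", "--name", "ci"])
ns.func(ns)  # constructs HelloRunner(ns) and calls its run() -> prints "hello, ci"
```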
+""" + +import os +from dataclasses import field, fields, is_dataclass, MISSING +from pathlib import Path +from textwrap import indent +from typing import Optional, Union + +from cli.lib.common.utils import str2bool + + +def get_env(name: str, default: str = "") -> str: + """Get environment variable with default fallback.""" + return os.environ.get(name) or default + + +def env_path_optional( + name: str, + default: Optional[Union[str, Path]] = None, + resolve: bool = True, +) -> Optional[Path]: + """Get environment variable as optional Path.""" + val = get_env(name) or default + if not val: + return None + + path = Path(val) + return path.resolve() if resolve else path + + +def env_path( + name: str, + default: Optional[Union[str, Path]] = None, + resolve: bool = True, +) -> Path: + """Get environment variable as Path, raise if missing.""" + path = env_path_optional(name, default, resolve) + if not path: + raise ValueError(f"Missing path value for {name}") + return path + + +def env_bool( + name: str, + default: bool = False, +) -> bool: + val = get_env(name) + if not val: + return default + return str2bool(val) + + +def env_bool_field( + name: str, + default: bool = False, +): + return field(default_factory=lambda: env_bool(name, default)) + + +def env_path_field( + name: str, + default: Union[str, Path] = "", + *, + resolve: bool = True, +) -> Path: + return field(default_factory=lambda: env_path(name, default, resolve=resolve)) + + +def env_str_field( + name: str, + default: str = "", +) -> str: + return field(default_factory=lambda: get_env(name, default)) + + +def generate_dataclass_help(cls) -> str: + """Auto-generate help text for dataclass fields.""" + if not is_dataclass(cls): + raise TypeError(f"{cls} is not a dataclass") + + def get_value(f): + if f.default is not MISSING: + return f.default + if f.default_factory is not MISSING: + try: + return f.default_factory() + except Exception as e: + return f"" + return "" + + lines = [f"{f.name:<22} = {repr(get_value(f))}" for f in fields(cls)] + return indent("\n".join(lines), " ") + + +def with_params_help(params_cls: type, title: str = "Parameter defaults"): + """ + Class decorator that appends a help table generated from another dataclass + (e.g., VllmParameters) to the decorated class's docstring. 
+ """ + if not is_dataclass(params_cls): + raise TypeError(f"{params_cls} must be a dataclass") + + def _decorator(cls: type) -> type: + block = generate_dataclass_help(params_cls) + cls.__doc__ = (cls.__doc__ or "") + f"\n\n{title}:\n{block}" + return cls + + return _decorator diff --git a/.ci/lumen_cli/cli/lib/common/gh_summary.py b/.ci/lumen_cli/cli/lib/common/gh_summary.py new file mode 100644 index 0000000000000..72bfaa76e7068 --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/gh_summary.py @@ -0,0 +1,143 @@ +from __future__ import annotations + +import logging +import os +import textwrap +from pathlib import Path +from typing import TYPE_CHECKING + +from cli.lib.common.utils import get_wheels +from jinja2 import Template + + +if TYPE_CHECKING: + from collections.abc import Iterable, Mapping + + +logger = logging.getLogger(__name__) + +_TPL_CONTENT = Template( + textwrap.dedent("""\ + ## {{ title }} + + ```{{ lang }} + {{ content }} + ``` +""") +) + +_TPL_LIST_ITEMS = Template( + textwrap.dedent("""\ + ## {{ title }} + {% for it in items %} + - {{ it.pkg }}: {{ it.relpath }} + {% else %} + _(no item found)_ + {% endfor %} + """) +) + +_TPL_TABLE = Template( + textwrap.dedent("""\ + {%- if rows %} + | {{ cols | join(' | ') }} | + |{%- for _ in cols %} --- |{%- endfor %} + {%- for r in rows %} + | {%- for c in cols %} {{ r.get(c, "") }} |{%- endfor %} + {%- endfor %} + {%- else %} + _(no data)_ + {%- endif %} +""") +) + + +def gh_summary_path() -> Path | None: + """Return the Path to the GitHub step summary file, or None if not set.""" + p = os.environ.get("GITHUB_STEP_SUMMARY") + return Path(p) if p else None + + +def write_gh_step_summary(md: str, *, append_content: bool = True) -> bool: + """ + Write Markdown content to the GitHub Step Summary file if GITHUB_STEP_SUMMARY is set. + append_content: default true, if True, append to the end of the file, else overwrite the whole file + + Returns: + True if written successfully (in GitHub Actions environment), + False if skipped (e.g., running locally where the variable is not set). + """ + sp = gh_summary_path() + if not sp: + logger.info("[gh-summary] GITHUB_STEP_SUMMARY not set, skipping write.") + return False + + md_clean = textwrap.dedent(md).strip() + "\n" + + mode = "a" if append_content else "w" + with sp.open(mode, encoding="utf-8") as f: + f.write(md_clean) + return True + + +def md_heading(text: str, level: int = 2) -> str: + """Generate a Markdown heading string with the given level (1-6).""" + return f"{'#' * max(1, min(level, 6))} {text}\n" + + +def md_details(summary: str, content: str) -> str: + """Generate a collapsible
<details> block with a summary and inner content.""" + return f"<details>\n<summary>{summary}</summary>\n\n{content}\n\n</details>
\n" + + +def summarize_content_from_file( + output_dir: Path, + freeze_file: str, + title: str = "Content from file", + code_lang: str = "", # e.g. "text" or "ini" +) -> bool: + f = Path(output_dir) / freeze_file + if not f.exists(): + return False + content = f.read_text(encoding="utf-8").strip() + md = render_content(content, title=title, lang=code_lang) + return write_gh_step_summary(md) + + +def summarize_wheels(path: Path, title: str = "Wheels", max_depth: int = 3): + items = get_wheels(path, max_depth=max_depth) + if not items: + return False + md = render_list(items, title=title) + return write_gh_step_summary(md) + + +def md_kv_table(rows: Iterable[Mapping[str, str | int | float]]) -> str: + """ + Render a list of dicts as a Markdown table using Jinja template. + """ + rows = list(rows) + cols = list({k for r in rows for k in r.keys()}) + md = _TPL_TABLE.render(cols=cols, rows=rows).strip() + "\n" + return md + + +def render_list( + items: Iterable[str], + *, + title: str = "List", +) -> str: + tpl = _TPL_LIST_ITEMS + md = tpl.render(title=title, items=items) + return md + + +def render_content( + content: str, + *, + title: str = "Content", + lang: str = "text", +) -> str: + tpl = _TPL_CONTENT + md = tpl.render(title=title, content=content, lang=lang) + return md diff --git a/.ci/lumen_cli/cli/lib/common/git_helper.py b/.ci/lumen_cli/cli/lib/common/git_helper.py new file mode 100644 index 0000000000000..9833caca956cb --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/git_helper.py @@ -0,0 +1,69 @@ +""" +Git Utility helpers for CLI tasks. +""" + +import logging +from pathlib import Path + +from cli.lib.common.path_helper import remove_dir +from git import GitCommandError, RemoteProgress, Repo + + +logger = logging.getLogger(__name__) + + +class PrintProgress(RemoteProgress): + """Simple progress logger for git operations.""" + + def __init__(self, interval: int = 5): + super().__init__() + self._last_percent = -1 + self._interval = interval + + def update(self, op_code, cur, max=None, message=""): + msg = self._cur_line or message + if max and cur: + percent = int(cur / max * 100) + if percent != self._last_percent and percent % self._interval == 0: + self._last_percent = percent + logger.info("Progress: %d%% - %s", percent, msg) + elif msg: + logger.info(msg) + + +def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules=False): + """Clone repository with pinned commit and optional submodules.""" + dst = dst or target + + try: + logger.info("Cloning %s to %s", target, dst) + + # Clone and fetch + remove_dir(dst) + r = Repo.clone_from(repo, dst, progress=PrintProgress()) + r.git.fetch("--all", "--tags") + + # Checkout pinned commit + commit = get_post_build_pinned_commit(target) + logger.info("Checking out pinned %s commit %s", target, commit) + r.git.checkout(commit) + + # Update submodules if requested + if update_submodules and r.submodules: + logger.info("Updating %d submodule(s)", len(r.submodules)) + for sm in r.submodules: + sm.update(init=True, recursive=True, progress=PrintProgress()) + + logger.info("Successfully cloned %s", target) + return r, commit + + except GitCommandError as e: + logger.error("Git operation failed: %s", e) + raise + + +def get_post_build_pinned_commit(name: str, prefix=".github/ci_commit_pins") -> str: + path = Path(prefix) / f"{name}.txt" + if not path.exists(): + raise FileNotFoundError(f"Pin file not found: {path}") + return path.read_text(encoding="utf-8").strip() diff --git a/.ci/lumen_cli/cli/lib/common/logger.py 
b/.ci/lumen_cli/cli/lib/common/logger.py new file mode 100644 index 0000000000000..7a638206d9316 --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/logger.py @@ -0,0 +1,14 @@ +""" +Logger Utility helpers for CLI tasks. +""" + +import logging +import sys + + +def setup_logging(level: int = logging.INFO): + logging.basicConfig( + level=level, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", + stream=sys.stdout, + ) diff --git a/.ci/lumen_cli/cli/lib/common/path_helper.py b/.ci/lumen_cli/cli/lib/common/path_helper.py new file mode 100644 index 0000000000000..4f74aa6e509de --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/path_helper.py @@ -0,0 +1,62 @@ +"""Path utility helpers for CLI tasks.""" + +import logging +import shutil +from pathlib import Path +from typing import Union + + +logger = logging.getLogger(__name__) + + +def get_path(path: Union[str, Path], resolve: bool = False) -> Path: + """Convert to Path object, optionally resolving to absolute path.""" + if not path: + raise ValueError("Path cannot be None or empty") + result = Path(path) + return result.resolve() if resolve else result + + +def ensure_dir_exists(path: Union[str, Path]) -> Path: + """Create directory if it doesn't exist.""" + path_obj = get_path(path) + path_obj.mkdir(parents=True, exist_ok=True) + return path_obj + + +def remove_dir(path: Union[str, Path, None]) -> None: + """Remove directory if it exists.""" + if not path: + return + path_obj = get_path(path) + if path_obj.exists(): + shutil.rmtree(path_obj) + + +def force_create_dir(path: Union[str, Path]) -> Path: + """Remove directory if exists, then create fresh empty directory.""" + remove_dir(path) + return ensure_dir_exists(path) + + +def copy(src: Union[str, Path], dst: Union[str, Path]) -> None: + """Copy file or directory from src to dst.""" + src_path = get_path(src, resolve=True) + dst_path = get_path(dst, resolve=True) + + if not src_path.exists(): + raise FileNotFoundError(f"Source does not exist: {src_path}") + + dst_path.parent.mkdir(parents=True, exist_ok=True) + + if src_path.is_file(): + shutil.copy2(src_path, dst_path) + elif src_path.is_dir(): + shutil.copytree(src_path, dst_path, dirs_exist_ok=True) + else: + raise ValueError(f"Unsupported path type: {src_path}") + + +def is_path_exist(path: Union[str, Path, None]) -> bool: + """Check if path exists.""" + return bool(path and get_path(path).exists()) diff --git a/.ci/lumen_cli/cli/lib/common/pip_helper.py b/.ci/lumen_cli/cli/lib/common/pip_helper.py new file mode 100644 index 0000000000000..a53747e24d256 --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/pip_helper.py @@ -0,0 +1,71 @@ +import glob +import logging +import shlex +import shutil +import sys +from collections.abc import Iterable +from importlib.metadata import PackageNotFoundError, version # noqa: UP035 +from typing import Optional, Union + +from cli.lib.common.utils import run_command + + +logger = logging.getLogger(__name__) + + +def pip_install_packages( + packages: Iterable[str] = (), + env=None, + *, + requirements: Optional[str] = None, + constraints: Optional[str] = None, + prefer_uv: bool = False, +) -> None: + use_uv = prefer_uv and shutil.which("uv") is not None + base = ( + [sys.executable, "-m", "uv", "pip", "install"] + if use_uv + else [sys.executable, "-m", "pip", "install"] + ) + cmd = base[:] + if requirements: + cmd += ["-r", requirements] + if constraints: + cmd += ["-c", constraints] + cmd += list(packages) + logger.info("pip installing packages: %s", " ".join(map(shlex.quote, cmd))) + run_command(" 
".join(map(shlex.quote, cmd)), env=env) + + +def pip_install_first_match(pattern: str, extras: Optional[str] = None, pref_uv=False): + wheel = first_matching_pkg(pattern) + target = f"{wheel}[{extras}]" if extras else wheel + logger.info("Installing %s...", target) + pip_install_packages([target], prefer_uv=pref_uv) + + +def run_python(args: Union[str, list[str]], env=None): + """ + Run the python in the current environment. + """ + if isinstance(args, str): + args = shlex.split(args) + cmd = [sys.executable] + args + run_command(" ".join(map(shlex.quote, cmd)), env=env) + + +def pkg_exists(name: str) -> bool: + try: + pkg_version = version(name) + logger.info("%s already exist with version: %s", name, pkg_version) + return True + except PackageNotFoundError: + logger.info("%s is not installed", name) + return False + + +def first_matching_pkg(pattern: str) -> str: + matches = sorted(glob.glob(pattern)) + if not matches: + raise FileNotFoundError(f"No wheel matching: {pattern}") + return matches[0] diff --git a/.ci/lumen_cli/cli/lib/common/utils.py b/.ci/lumen_cli/cli/lib/common/utils.py new file mode 100644 index 0000000000000..b03309810d986 --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/utils.py @@ -0,0 +1,139 @@ +""" +General Utility helpers for CLI tasks. +""" + +import logging +import os +import shlex +import subprocess +import sys +from contextlib import contextmanager +from pathlib import Path +from typing import Optional + + +logger = logging.getLogger(__name__) + + +def run_command( + cmd: str, + use_shell: bool = False, + log_cmd: bool = True, + cwd: Optional[str] = None, + env: Optional[dict] = None, + check: bool = True, +) -> int: + """Run a command with optional shell execution.""" + if use_shell: + args = cmd + log_prefix = "[shell]" + executable = "/bin/bash" + else: + args = shlex.split(cmd) + log_prefix = "[cmd]" + executable = None + + if log_cmd: + display_cmd = cmd if use_shell else " ".join(args) + logger.info("%s %s", log_prefix, display_cmd) + + run_env = {**os.environ, **(env or {})} + + proc = subprocess.run( + args, + shell=use_shell, + executable=executable, + stdout=sys.stdout, + stderr=sys.stderr, + cwd=cwd, + env=run_env, + check=False, + ) + + if check and proc.returncode != 0: + logger.error( + "%s Command failed (exit %s): %s", log_prefix, proc.returncode, cmd + ) + raise subprocess.CalledProcessError( + proc.returncode, args if not use_shell else cmd + ) + + return proc.returncode + + +def str2bool(value: Optional[str]) -> bool: + """Convert environment variables to boolean values.""" + if not value: + return False + if not isinstance(value, str): + raise ValueError( + f"Expected a string value for boolean conversion, got {type(value)}" + ) + value = value.strip().lower() + + true_value_set = {"1", "true", "t", "yes", "y", "on", "enable", "enabled", "found"} + false_value_set = {"0", "false", "f", "no", "n", "off", "disable"} + + if value in true_value_set: + return True + if value in false_value_set: + return False + raise ValueError(f"Invalid string value for boolean conversion: {value}") + + +@contextmanager +def temp_environ(updates: dict[str, str]): + """ + Temporarily set environment variables and restore them after the block. + Args: + updates: Dict of environment variables to set. 
+ """ + missing = object() + old: dict[str, str | object] = {k: os.environ.get(k, missing) for k in updates} + try: + os.environ.update(updates) + yield + finally: + for k, v in old.items(): + if v is missing: + os.environ.pop(k, None) + else: + os.environ[k] = v # type: ignore[arg-type] + + +@contextmanager +def working_directory(path: str): + """ + Temporarily change the working directory inside a context. + """ + if not path: + # No-op context + yield + return + prev_cwd = os.getcwd() + try: + os.chdir(path) + yield + finally: + os.chdir(prev_cwd) + + +def get_wheels( + output_dir: Path, + max_depth: Optional[int] = None, +) -> list[str]: + """Return a list of wheels found in the given output directory.""" + root = Path(output_dir) + if not root.exists(): + return [] + items = [] + for dirpath, _, filenames in os.walk(root): + depth = Path(dirpath).relative_to(root).parts + if max_depth is not None and len(depth) > max_depth: + continue + for fname in sorted(filenames): + if fname.endswith(".whl"): + pkg = fname.split("-")[0] + relpath = str((Path(dirpath) / fname).relative_to(root)) + items.append({"pkg": pkg, "relpath": relpath}) + return items diff --git a/.ci/lumen_cli/cli/lib/core/vllm/lib.py b/.ci/lumen_cli/cli/lib/core/vllm/lib.py new file mode 100644 index 0000000000000..98cfc807e284a --- /dev/null +++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py @@ -0,0 +1,296 @@ +import logging +import os +import textwrap +from typing import Any + +from cli.lib.common.gh_summary import write_gh_step_summary +from cli.lib.common.git_helper import clone_external_repo +from cli.lib.common.pip_helper import pip_install_packages +from cli.lib.common.utils import run_command, temp_environ, working_directory +from jinja2 import Template + + +logger = logging.getLogger(__name__) + +_TPL_VLLM_INFO = Template( + textwrap.dedent("""\ + ## Vllm against Pytorch CI Test Summary + **Vllm Commit**: [{{ vllm_commit }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }}) + {%- if torch_sha %} + **Pytorch Commit**: [{{ torch_sha }}](https://github.com/pytorch/pytorch/commit/{{ torch_sha }}) + {%- endif %} +""") +) + + +def sample_vllm_test_library(): + """ + Simple sample to unblock the vllm ci development, which is mimic to + https://github.com/vllm-project/vllm/blob/main/.buildkite/test-pipeline.yaml + see run_test_plan for more details + """ + # TODO(elainewy): Read from yaml file to handle the env and tests for vllm + return { + "vllm_basic_correctness_test": { + "title": "Basic Correctness Test", + "id": "vllm_basic_correctness_test", + "env_vars": { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + }, + "steps": [ + "pytest -v -s basic_correctness/test_cumem.py", + "pytest -v -s basic_correctness/test_basic_correctness.py", + "pytest -v -s basic_correctness/test_cpu_offload.py", + "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py", + ], + }, + "vllm_basic_models_test": { + "title": "Basic models test", + "id": "vllm_basic_models_test", + "steps": [ + "pytest -v -s models/test_transformers.py", + "pytest -v -s models/test_registry.py", + "pytest -v -s models/test_utils.py", + "pytest -v -s models/test_vision.py", + "pytest -v -s models/test_initialization.py", + ], + }, + "vllm_entrypoints_test": { + "title": "Entrypoints Test ", + "id": "vllm_entrypoints_test", + "env_vars": { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + }, + "steps": [ + " ".join( + [ + "pytest", + "-v", + "-s", + "entrypoints/llm", + "--ignore=entrypoints/llm/test_lazy_outlines.py", + 
"--ignore=entrypoints/llm/test_generate.py", + "--ignore=entrypoints/llm/test_generate_multiple_loras.py", + "--ignore=entrypoints/llm/test_collective_rpc.py", + ] + ), + "pytest -v -s entrypoints/llm/test_lazy_outlines.py", + "pytest -v -s entrypoints/llm/test_generate.py ", + "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode", + ], + }, + "vllm_regression_test": { + "title": "Regression Test", + "id": "vllm_regression_test", + "package_install": ["modelscope"], + "steps": [ + "pytest -v -s test_regression.py", + ], + }, + "vllm_lora_tp_test_distributed": { + "title": "LoRA TP Test (Distributed)", + "id": "vllm_lora_tp_test_distributed", + "env_vars": { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + }, + "num_gpus": 4, + "steps": [ + "pytest -v -s -x lora/test_chatglm3_tp.py", + "pytest -v -s -x lora/test_llama_tp.py", + "pytest -v -s -x lora/test_llm_with_multi_loras.py", + ], + }, + "vllm_distributed_test_28_failure_test": { + "title": "Distributed Tests (2 GPUs) pytorch 2.8 release failure", + "id": "vllm_distributed_test_28_failure_test", + "env_vars": { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + }, + "num_gpus": 4, + "steps": [ + "pytest -v -s distributed/test_sequence_parallel.py", + ], + }, + "vllm_lora_28_failure_test": { + "title": "LoRA pytorch 2.8 failure test", + "id": "vllm_lora_28_failure_test", + "steps": ["pytest -v lora/test_quant_model.py"], + }, + "vllm_multi_model_processor_test": { + "title": "Multi-Modal Processor Test", + "id": "vllm_multi_model_processor_test", + "package_install": ["git+https://github.com/TIGER-AI-Lab/Mantis.git"], + "steps": [ + "pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py", + ], + }, + "vllm_multi_model_test_28_failure_test": { + "title": "Multi-Model Test (Failed 2.8 release)", + "id": "vllm_multi_model_test_28_failure_test", + "package_install": ["git+https://github.com/TIGER-AI-Lab/Mantis.git"], + "steps": [ + "pytest -v -s models/multimodal/generation/test_voxtral.py", + "pytest -v -s models/multimodal/pooling", + ], + }, + "vllm_pytorch_compilation_unit_tests": { + "title": "PyTorch Compilation Unit Tests", + "id": "vllm_pytorch_compilation_unit_tests", + "steps": [ + "pytest -v -s compile/test_pass_manager.py", + "pytest -v -s compile/test_fusion.py", + "pytest -v -s compile/test_fusion_attn.py", + "pytest -v -s compile/test_silu_mul_quant_fusion.py", + "pytest -v -s compile/test_sequence_parallelism.py", + "pytest -v -s compile/test_async_tp.py", + "pytest -v -s compile/test_fusion_all_reduce.py", + "pytest -v -s compile/test_decorator.py", + ], + }, + "vllm_languagde_model_test_extended_generation_28_failure_test": { + "title": "Language Models Test (Extended Generation) 2.8 release failure", + "id": "vllm_languagde_model_test_extended_generation_28_failure_test", + "package_install": [ + "--no-build-isolation", + "git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8", + ], + "steps": [ + "pytest -v -s models/language/generation/test_mistral.py", + ], + }, + "vllm_distributed_test_2_gpu_28_failure_test": { + "title": "Distributed Tests (2 GPUs) pytorch 2.8 release failure", + "id": "vllm_distributed_test_2_gpu_28_failure_test", + "env_vars": { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + }, + "num_gpus": 4, + "steps": [ + "pytest -v -s distributed/test_sequence_parallel.py", + ], + }, + # TODO(elainewy):need to add g6 with 4 gpus to run this test + "vllm_lora_test": { + "title": "LoRA Test %N", + "id": "lora_test", + "parallelism": 4, + "steps": [ + "echo '[checking] 
list sharded lora tests:'", + " ".join( + [ + "pytest -q --collect-only lora", + "--shard-id=$$BUILDKITE_PARALLEL_JOB", + "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT", + "--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py", + ] + ), + "echo '[checking] Done. list lora tests'", + " ".join( + [ + "pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB", + "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT", + "--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py", + ] + ), + ], + }, + } + + +def check_parallelism(tests: Any, title: str, shard_id: int = 0, num_shards: int = 0): + """ + a method to check if the test plan is parallelism or not. + """ + parallelism = int(tests.get("parallelism", "0")) + is_parallel = parallelism and parallelism > 1 + + if not is_parallel: + return False + + if shard_id > num_shards: + raise RuntimeError( + f"Test {title} expects {num_shards} shards, but invalid {shard_id} is provided" + ) + + if num_shards != parallelism: + raise RuntimeError( + f"Test {title} expects {parallelism} shards, but invalid {num_shards} is provided" + ) + + return True + + +def run_test_plan( + test_plan: str, + test_target: str, + tests_map: dict[str, Any], + shard_id: int = 0, + num_shards: int = 0, +): + """ + a method to run list of tests based on the test plan. + """ + logger.info("run %s tests.....", test_target) + if test_plan not in tests_map: + raise RuntimeError( + f"test {test_plan} not found, please add it to test plan pool" + ) + tests = tests_map[test_plan] + pkgs = tests.get("package_install", []) + title = tests.get("title", "unknown test") + + is_parallel = check_parallelism(tests, title, shard_id, num_shards) + if is_parallel: + title = title.replace("%N", f"{shard_id}/{num_shards}") + + logger.info("Running tests: %s", title) + if pkgs: + logger.info("Installing packages: %s", pkgs) + pip_install_packages(packages=pkgs, prefer_uv=True) + with ( + working_directory(tests.get("working_directory", "tests")), + temp_environ(tests.get("env_vars", {})), + ): + failures = [] + for step in tests["steps"]: + logger.info("Running step: %s", step) + if is_parallel: + step = replace_buildkite_placeholders(step, shard_id, num_shards) + logger.info("Running parallel step: %s", step) + code = run_command(cmd=step, check=False, use_shell=True) + if code != 0: + failures.append(step) + logger.info("Finish running step: %s", step) + if failures: + logger.error("Failed tests: %s", failures) + raise RuntimeError(f"{len(failures)} pytest runs failed: {failures}") + logger.info("Done. 
All tests passed") + + +def clone_vllm(dst: str = "vllm"): + _, commit = clone_external_repo( + target="vllm", + repo="https://github.com/vllm-project/vllm.git", + dst=dst, + update_submodules=True, + ) + return commit + + +def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) -> str: + mapping = { + "$$BUILDKITE_PARALLEL_JOB_COUNT": str(num_shards), + "$$BUILDKITE_PARALLEL_JOB": str(shard_id), + } + for k in sorted(mapping, key=len, reverse=True): + step = step.replace(k, mapping[k]) + return step + + +def summarize_build_info(vllm_commit: str) -> bool: + torch_sha = os.getenv("GITHUB_SHA") + md = ( + _TPL_VLLM_INFO.render(vllm_commit=vllm_commit, torch_sha=torch_sha).strip() + + "\n" + ) + return write_gh_step_summary(md) diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py new file mode 100644 index 0000000000000..8db48065cb052 --- /dev/null +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py @@ -0,0 +1,285 @@ +import logging +import os +import textwrap +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + +from cli.lib.common.cli_helper import BaseRunner +from cli.lib.common.docker_helper import local_image_exists +from cli.lib.common.envs_helper import ( + env_bool_field, + env_path_field, + env_str_field, + with_params_help, +) +from cli.lib.common.gh_summary import ( + gh_summary_path, + summarize_content_from_file, + summarize_wheels, +) +from cli.lib.common.path_helper import ( + copy, + ensure_dir_exists, + force_create_dir, + get_path, + is_path_exist, +) +from cli.lib.common.utils import run_command +from cli.lib.core.vllm.lib import clone_vllm, summarize_build_info + + +logger = logging.getLogger(__name__) + + +# Default path for docker build artifacts +_DEFAULT_RESULT_PATH = "./shared" + +# Temp folder in vllm work place to cp torch whls in vllm work directory for docker build +_VLLM_TEMP_FOLDER = "tmp" + + +@dataclass +class VllmBuildParameters: + """ + Parameters defining the vllm external input configurations. + Combine with VllmDockerBuildArgs to define the vllm build environment + """ + + # USE_TORCH_WHEEL: when true, use local Torch wheels; requires TORCH_WHEELS_PATH. + # Otherwise docker build pull torch nightly during build + # TORCH_WHEELS_PATH: directory containing local torch wheels when use_torch_whl is True + use_torch_whl: bool = env_bool_field("USE_TORCH_WHEEL", True) + torch_whls_path: Path = env_path_field("TORCH_WHEELS_PATH", "./dist") + + # USE_LOCAL_BASE_IMAGE: when true, use an existing local Docker base image; requires BASE_IMAGE + # Otherwise, pull dockerfile's default image remotely + # BASE_IMAGE: name:tag (only needed when use_local_base_image is True) + use_local_base_image: bool = env_bool_field("USE_LOCAL_BASE_IMAGE", True) + base_image: str = env_str_field("BASE_IMAGE") + + # USE_LOCAL_DOCKERFILE: when true("1"), use a local Dockerfile; requires DOCKERFILE_PATH. 
+ # otherwise, use vllm's default dockerfile.torch_nightly for build + # DOCKERFILE_PATH: path to Dockerfile used when use_local_dockerfile is True" + use_local_dockerfile: bool = env_bool_field("USE_LOCAL_DOCKERFILE", True) + dockerfile_path: Path = env_path_field( + "DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm" + ) + + # OUTPUT_DIR: where docker buildx (local exporter) will write artifacts + output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm") + + # --- Build args ---------------------------------------------------------- + target_stage: str = env_str_field("TARGET_STAGE", "export-wheels") + + tag_name: str = env_str_field("TAG", "vllm-wheels") + + cuda_version: str = env_str_field("CUDA_VERSION", "12.8.1") + + python_version: str = env_str_field("PYTHON_VERSION", "3.12") + + max_jobs: str = env_str_field("MAX_JOBS", "64") + + sccache_bucket: str = env_str_field("SCCACHE_BUCKET") + + sccache_region: str = env_str_field("SCCACHE_REGION") + + torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9") + + def __post_init__(self): + checks = [ + ( + self.use_torch_whl, # flag + True, # trigger_value + "torch_whls_path", # resource + is_path_exist, # check_func + "TORCH_WHEELS_PATH is not provided, but USE_TORCH_WHEEL is set to 1", + ), + ( + self.use_local_base_image, + True, + "base_image", + local_image_exists, + f"BASE_IMAGE {self.base_image} does not found, but USE_LOCAL_BASE_IMAGE is set to 1", + ), + ( + self.use_local_dockerfile, + True, + "dockerfile_path", + is_path_exist, + " DOCKERFILE_PATH path does not found, but USE_LOCAL_DOCKERFILE is set to 1", + ), + ] + for flag, trigger_value, attr_name, check_func, error_msg in checks: + value = getattr(self, attr_name) + if flag == trigger_value: + if not value or not check_func(value): + raise ValueError(error_msg) + else: + logger.info("flag %s is not set", flag) + if not self.output_dir: + raise ValueError("missing required output_dir") + + +@with_params_help(VllmBuildParameters) +class VllmBuildRunner(BaseRunner): + """ + Build vLLM using docker buildx. + + Environment variable options: + "USE_TORCH_WHEEL": "1: use local wheels; 0: pull nightly from pypi", + "TORCH_WHEELS_PATH": "Path to local wheels (when USE_TORCH_WHEEL=1)", + + "USE_LOCAL_BASE_IMAGE": "1: use local base image; 0: default image", + "BASE_IMAGE": "name:tag to indicate base image the dockerfile depends on (when USE_LOCAL_BASE_IMAGE=1)", + + "USE_LOCAL_DOCKERFILE": "1: use local Dockerfile; 0: vllm repo default dockerfile.torch_nightly", + "DOCKERFILE_PATH": "Path to Dockerfile (when USE_LOCAL_DOCKERFILE=1)", + + "OUTPUT_DIR": "e.g. './shared'", + + "TORCH_CUDA_ARCH_LIST": "e.g. '8.0' or '8.0;9.0'", + "CUDA_VERSION": "e.g. '12.8.1'", + "PYTHON_VERSION": "e.g. '3.12'", + "MAX_JOBS": "e.g. '64'", + "SCCACHE_BUCKET": "e.g. 'my-bucket'", + "SCCACHE_REGION": "e.g. 'us-west-2'", + """ + + def __init__(self, args=None): + self.work_directory = "vllm" + + def run(self): + """ + main function to run vllm build + 1. prepare vllm build environment + 2. prepare the docker build command args + 3. 
run docker build + """ + inputs = VllmBuildParameters() + logger.info("Running vllm build with inputs: %s", inputs) + vllm_commit = clone_vllm() + + self.cp_dockerfile_if_exist(inputs) + # cp torch wheels from root direct to vllm workspace if exist + self.cp_torch_whls_if_exist(inputs) + + # make sure the output dir to store the build artifacts exist + ensure_dir_exists(Path(inputs.output_dir)) + + cmd = self._generate_docker_build_cmd(inputs) + logger.info("Running docker build: \n %s", cmd) + + try: + run_command(cmd, cwd="vllm", env=os.environ.copy()) + finally: + self.genearte_vllm_build_summary(vllm_commit, inputs) + + def genearte_vllm_build_summary( + self, vllm_commit: str, inputs: VllmBuildParameters + ): + if not gh_summary_path(): + return logger.info("Skipping, not detect GH Summary env var....") + logger.info("Generate GH Summary ...") + # summarize vllm build info + summarize_build_info(vllm_commit) + + # summarize vllm build artifacts + vllm_artifact_dir = inputs.output_dir / "wheels" + summarize_content_from_file( + vllm_artifact_dir, + "build_summary.txt", + title="Vllm build env pip package summary", + ) + summarize_wheels( + inputs.torch_whls_path, max_depth=3, title="Torch Wheels Artifacts" + ) + summarize_wheels(vllm_artifact_dir, max_depth=3, title="Vllm Wheels Artifacts") + + def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str: + if not inputs.use_torch_whl: + return "" + tmp_dir = f"./{self.work_directory}/{_VLLM_TEMP_FOLDER}" + tmp_path = Path(tmp_dir) + force_create_dir(tmp_path) + copy(inputs.torch_whls_path, tmp_dir) + return tmp_dir + + def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters): + if not inputs.use_local_dockerfile: + logger.info("using vllm default dockerfile.torch_nightly for build") + return + dockerfile_path = get_path(inputs.dockerfile_path, resolve=True) + vllm_torch_dockerfile = Path( + f"./{self.work_directory}/docker/Dockerfile.nightly_torch" + ) + copy(dockerfile_path, vllm_torch_dockerfile) + + def get_result_path(self, path): + """ + Get the absolute path of the result path + """ + if not path: + path = _DEFAULT_RESULT_PATH + abs_path = get_path(path, resolve=True) + return abs_path + + def _get_torch_wheel_path_arg(self, torch_whl_dir: Optional[Path]) -> str: + if not torch_whl_dir: + return "" + return f"--build-arg TORCH_WHEELS_PATH={_VLLM_TEMP_FOLDER}" + + def _get_base_image_args(self, inputs: VllmBuildParameters) -> tuple[str, str, str]: + """ + Returns: + - base_image_arg: docker buildx arg string for base image + - final_base_image_arg: docker buildx arg string for vllm-base stage + - pull_flag: --pull=true or --pull=false depending on whether the image exists locally + """ + if not inputs.use_local_base_image: + return "", "", "" + + base_image = inputs.base_image + + # set both base image and final base image to the same local image + base_image_arg = f"--build-arg BUILD_BASE_IMAGE={base_image}" + final_base_image_arg = f"--build-arg FINAL_BASE_IMAGE={base_image}" + + if local_image_exists(base_image): + pull_flag = "--pull=false" + return base_image_arg, final_base_image_arg, pull_flag + logger.info( + "[INFO] Local image not found:%s will try to pull from remote", {base_image} + ) + return base_image_arg, final_base_image_arg, "" + + def _generate_docker_build_cmd( + self, + inputs: VllmBuildParameters, + ) -> str: + base_image_arg, final_base_image_arg, pull_flag = self._get_base_image_args( + inputs + ) + torch_arg = self._get_torch_wheel_path_arg(inputs.torch_whls_path) + + return textwrap.dedent( 
+ f""" + docker buildx build \ + --output type=local,dest={inputs.output_dir} \ + -f docker/Dockerfile.nightly_torch \ + {pull_flag} \ + {torch_arg} \ + {base_image_arg} \ + {final_base_image_arg} \ + --build-arg max_jobs={inputs.max_jobs} \ + --build-arg CUDA_VERSION={inputs.cuda_version} \ + --build-arg PYTHON_VERSION={inputs.python_version} \ + --build-arg USE_SCCACHE={int(bool(inputs.sccache_bucket and inputs.sccache_region))} \ + --build-arg SCCACHE_BUCKET_NAME={inputs.sccache_bucket} \ + --build-arg SCCACHE_REGION_NAME={inputs.sccache_region} \ + --build-arg torch_cuda_arch_list='{inputs.torch_cuda_arch_list}' \ + --target {inputs.target_stage} \ + -t {inputs.tag_name} \ + --progress=plain . + """ + ).strip() diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py new file mode 100644 index 0000000000000..76401e33f29fd --- /dev/null +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py @@ -0,0 +1,269 @@ +import logging +import os +import re +import subprocess +import sys +from collections.abc import Iterable +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Any + +from cli.lib.common.cli_helper import BaseRunner +from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env +from cli.lib.common.path_helper import copy, remove_dir +from cli.lib.common.pip_helper import ( + pip_install_first_match, + pip_install_packages, + pkg_exists, + run_python, +) +from cli.lib.common.utils import run_command, working_directory +from cli.lib.core.vllm.lib import clone_vllm, run_test_plan, sample_vllm_test_library + + +logger = logging.getLogger(__name__) + + +@dataclass +class VllmTestParameters: + """ + Parameters defining the vllm external test input + + !!!DO NOT ADD SECRETS IN THIS CLASS!!! + you can put environment variable name in VllmTestParameters if it's not the same as the secret one + fetch secrests directly from env variables during runtime + """ + + torch_whls_path: Path = env_path_field("WHEELS_PATH", "./dist") + + vllm_whls_path: Path = env_path_field( + "VLLM_WHEELS_PATH", "./dist/external/vllm/wheels" + ) + + torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9") + + def __post_init__(self): + if not self.torch_whls_path.exists(): + raise ValueError("missing torch_whls_path") + if not self.vllm_whls_path.exists(): + raise ValueError("missing vllm_whls_path") + + +class TestInpuType(Enum): + TEST_PLAN = "test_plan" + UNKNOWN = "unknown" + + +class VllmTestRunner(BaseRunner): + def __init__(self, args: Any): + self.work_directory = "vllm" + self.test_plan = "" + self.test_type = TestInpuType.UNKNOWN + + self.shard_id = args.shard_id + self.num_shards = args.num_shards + + if args.test_plan: + self.test_plan = args.test_plan + self.test_type = TestInpuType.TEST_PLAN + + # Matches the structeur in the artifacts.zip from torcb build + self.TORCH_WHL_PATH_REGEX = "torch*.whl" + self.TORCH_WHL_EXTRA = "opt-einsum" + self.TORCH_ADDITIONAL_WHLS_REGEX = [ + "vision/torchvision*.whl", + "audio/torchaudio*.whl", + ] + + # Match the structure of the artifacts.zip from vllm external build + self.VLLM_TEST_WHLS_REGEX = [ + "xformers/*.whl", + "vllm/vllm*.whl", + "flashinfer-python/flashinfer*.whl", + ] + + def prepare(self): + """ + prepare test environment for vllm. 
This includes clone vllm repo, install all wheels, test dependencies and set env + """ + params = VllmTestParameters() + logger.info("Display VllmTestParameters %s", params) + self._set_envs(params) + + clone_vllm(dst=self.work_directory) + with working_directory(self.work_directory): + remove_dir(Path("vllm")) + self._install_wheels(params) + self._install_dependencies() + # verify the torches are not overridden by test dependencies + check_versions() + + def run(self): + """ + main function to run vllm test + """ + self.prepare() + try: + with working_directory(self.work_directory): + if self.test_type == TestInpuType.TEST_PLAN: + if self.num_shards > 1: + run_test_plan( + self.test_plan, + "vllm", + sample_vllm_test_library(), + self.shard_id, + self.num_shards, + ) + else: + run_test_plan( + self.test_plan, "vllm", sample_vllm_test_library() + ) + else: + raise ValueError(f"Unknown test type {self.test_type}") + finally: + # double check the torches are not overridden by other packages + check_versions() + + def _install_wheels(self, params: VllmTestParameters): + logger.info("Running vllm test with inputs: %s", params) + if not pkg_exists("torch"): + # install torch from local whls if it's not installed yet. + torch_p = f"{str(params.torch_whls_path)}/{self.TORCH_WHL_PATH_REGEX}" + pip_install_first_match(torch_p, self.TORCH_WHL_EXTRA) + + torch_whls_path = [ + f"{str(params.torch_whls_path)}/{whl_path}" + for whl_path in self.TORCH_ADDITIONAL_WHLS_REGEX + ] + for torch_whl in torch_whls_path: + pip_install_first_match(torch_whl) + logger.info("Done. Installed torch and other torch-related wheels ") + + logger.info("Installing vllm wheels") + vllm_whls_path = [ + f"{str(params.vllm_whls_path)}/{whl_path}" + for whl_path in self.VLLM_TEST_WHLS_REGEX + ] + for vllm_whl in vllm_whls_path: + pip_install_first_match(vllm_whl) + logger.info("Done. Installed vllm wheels") + + def _install_test_dependencies(self): + """ + This method replaces torch dependencies with local torch wheel info in + requirements/test.in file from vllm repo. then generates the test.txt + in runtime + """ + logger.info("generate test.txt from requirements/test.in with local torch whls") + preprocess_test_in() + copy("requirements/test.txt", "snapshot_constraint.txt") + + run_command( + f"{sys.executable} -m uv pip compile requirements/test.in " + "-o test.txt " + "--index-strategy unsafe-best-match " + "--constraint snapshot_constraint.txt " + "--torch-backend cu128" + ) + pip_install_packages(requirements="test.txt", prefer_uv=True) + logger.info("Done. installed requirements for test dependencies") + + def _install_dependencies(self): + pip_install_packages(packages=["-e", "tests/vllm_test_utils"], prefer_uv=True) + pip_install_packages(packages=["hf_transfer"], prefer_uv=True) + os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" + + # using script from vllm repo to remove all torch packages from requirements txt + run_python("use_existing_torch.py") + + # install common packages + for requirements in ["requirements/common.txt", "requirements/build.txt"]: + pip_install_packages( + requirements=requirements, + prefer_uv=True, + ) + # install test packages + self._install_test_dependencies() + + def _set_envs(self, inputs: VllmTestParameters): + os.environ["TORCH_CUDA_ARCH_LIST"] = inputs.torch_cuda_arch_list + if not validate_cuda(get_env("TORCH_CUDA_ARCH_LIST")): + logger.warning( + "Missing supported TORCH_CUDA_ARCH_LIST. 
" + "Currently support TORCH_CUDA_ARCH_LIST env var " + "with supported arch [8.0, 8.9, 9.0]" + ) + + os.environ["HF_TOKEN"] = os.getenv("VLLM_TEST_HUGGING_FACE_TOKEN", "") + if not get_env("HF_TOKEN"): + raise ValueError( + "missing required HF_TOKEN, please set VLLM_TEST_HUGGING_FACE_TOKEN env var" + ) + if not get_env("TORCH_CUDA_ARCH_LIST"): + raise ValueError( + "missing required TORCH_CUDA_ARCH_LIST, please set TORCH_CUDA_ARCH_LIST env var" + ) + + +def preprocess_test_in( + target_file: str = "requirements/test.in", additional_packages: Iterable[str] = () +): + """ + This modifies the target_file file in place in vllm work directory. + It removes torch and unwanted packages in target_file and replace with local torch whls + package with format "$WHEEL_PACKAGE_NAME @ file://" + """ + additional_package_to_move = list(additional_packages or ()) + pkgs_to_remove = [ + "torch", + "torchvision", + "torchaudio", + "xformers", + "mamba_ssm", + ] + additional_package_to_move + # Read current requirements + target_path = Path(target_file) + lines = target_path.read_text().splitlines() + + pkgs_to_add = [] + + # Remove lines starting with the package names (==, @, >=) — case-insensitive + pattern = re.compile(rf"^({'|'.join(pkgs_to_remove)})\s*(==|@|>=)", re.IGNORECASE) + kept_lines = [line for line in lines if not pattern.match(line)] + + # Get local installed torch/vision/audio from pip freeze + # This is hacky, but it works + pip_freeze = subprocess.check_output(["pip", "freeze"], text=True) + header_lines = [ + line + for line in pip_freeze.splitlines() + if re.match( + r"^(torch|torchvision|torchaudio)\s*@\s*file://", line, re.IGNORECASE + ) + ] + + # Write back: header_lines + blank + kept_lines + out_lines = header_lines + [""] + kept_lines + if pkgs_to_add: + out_lines += [""] + pkgs_to_add + + out = "\n".join(out_lines) + "\n" + target_path.write_text(out) + logger.info("[INFO] Updated %s", target_file) + + +def validate_cuda(value: str) -> bool: + VALID_VALUES = {"8.0", "8.9", "9.0"} + return all(v in VALID_VALUES for v in value.split()) + + +def check_versions(): + """ + check installed packages version + """ + logger.info("Double check installed packages") + patterns = ["torch", "xformers", "torchvision", "torchaudio", "vllm"] + for pkg in patterns: + pkg_exists(pkg) + logger.info("Done. 
checked installed packages") diff --git a/.ci/lumen_cli/cli/run.py b/.ci/lumen_cli/cli/run.py new file mode 100644 index 0000000000000..1711109170756 --- /dev/null +++ b/.ci/lumen_cli/cli/run.py @@ -0,0 +1,40 @@ +# main.py + +import argparse +import logging + +from cli.build_cli.register_build import register_build_commands +from cli.lib.common.logger import setup_logging +from cli.test_cli.register_test import register_test_commands + + +logger = logging.getLogger(__name__) + + +def main(): + # Define top-level parser + parser = argparse.ArgumentParser(description="Lumos CLI") + subparsers = parser.add_subparsers(dest="command", required=True) + parser.add_argument( + "--log-level", default="INFO", help="Log level (DEBUG, INFO, WARNING, ERROR)" + ) + + # registers second-level subcommands + register_build_commands(subparsers) + register_test_commands(subparsers) + + # parse args after all options are registered + args = parser.parse_args() + + # setup global logging + setup_logging(getattr(logging, args.log_level.upper(), logging.INFO)) + logger.debug("Parsed args: %s", args) + + if hasattr(args, "func"): + args.func(args) + else: + parser.print_help() + + +if __name__ == "__main__": + main() diff --git a/test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_complex b/.ci/lumen_cli/cli/test_cli/__init__.py similarity index 100% rename from test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_complex rename to .ci/lumen_cli/cli/test_cli/__init__.py diff --git a/.ci/lumen_cli/cli/test_cli/register_test.py b/.ci/lumen_cli/cli/test_cli/register_test.py new file mode 100644 index 0000000000000..2973341b83ed2 --- /dev/null +++ b/.ci/lumen_cli/cli/test_cli/register_test.py @@ -0,0 +1,62 @@ +import argparse +import logging + +from cli.lib.common.cli_helper import register_targets, RichHelp, TargetSpec +from cli.lib.core.vllm.vllm_test import VllmTestRunner + + +logger = logging.getLogger(__name__) + +# Maps targets to their argparse configuration and runner +# it adds new target to path python -m cli.run build external {target} with buildrunner +_TARGETS: dict[str, TargetSpec] = { + "vllm": { + "runner": VllmTestRunner, + "help": "test vLLM with pytorch main", + } + # add yours ... +} + + +def common_args(parser: argparse.ArgumentParser) -> None: + """ + Add common CLI arguments to the given parser. + """ + parser.add_argument( + "--shard-id", + type=int, + default=1, + help="a shard id to run, e.g. '0,1,2,3'", + ) + parser.add_argument( + "--num-shards", + type=int, + default=1, + help="a number of shards to run, e.g. '4'", + ) + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument( + "-tp", + "--test-plan", + type=str, + help="a pre-defined test plan to run, e.g. 
'basic_correctness_test'", + ) + + +def register_test_commands(subparsers: argparse._SubParsersAction) -> None: + build_parser = subparsers.add_parser( + "test", + help="test related commands", + formatter_class=RichHelp, + ) + build_subparsers = build_parser.add_subparsers(dest="test_command", required=True) + overview = "\n".join( + f" {name:12} {spec.get('help', '')}" for name, spec in _TARGETS.items() + ) + external_parser = build_subparsers.add_parser( + "external", + help="Test external targets", + description="Test third-party targets.\n\nAvailable targets:\n" + overview, + formatter_class=RichHelp, + ) + register_targets(external_parser, _TARGETS, common_args=common_args) diff --git a/.ci/lumen_cli/pyproject.toml b/.ci/lumen_cli/pyproject.toml new file mode 100644 index 0000000000000..bf5edc77d9250 --- /dev/null +++ b/.ci/lumen_cli/pyproject.toml @@ -0,0 +1,23 @@ +[project] +name = "lumen-ci" +version = "0.1.0" +dependencies = [ + "pyyaml==6.0.2", + "GitPython==3.1.45", + "docker==7.1.0", + "pytest==7.3.2", + "uv==0.8.6" +] + +[tool.setuptools] +packages = ["cli"] + +[tool.setuptools.package-dir] +cli = "cli" + +[tool.ruff.lint] +# Enable preview mode for linting +preview = true + +# Now you can select your preview rules, like RUF048 +extend-select = ["RUF048"] diff --git a/.ci/lumen_cli/tests/test_app.py b/.ci/lumen_cli/tests/test_app.py new file mode 100644 index 0000000000000..9d57b37f159d7 --- /dev/null +++ b/.ci/lumen_cli/tests/test_app.py @@ -0,0 +1,47 @@ +# tests/test_cli.py +import io +import sys +import unittest +from contextlib import redirect_stderr, redirect_stdout +from unittest.mock import patch + +from cli.run import main + + +class TestArgparseCLI(unittest.TestCase): + @patch("cli.build_cli.register_build.VllmBuildRunner.run", return_value=None) + @patch("cli.build_cli.register_build.VllmBuildRunner.__init__", return_value=None) + def test_cli_run_build_external(self, mock_init, mock_run): + from cli.run import main # import after patches if needed + + test_args = ["cli.run", "build", "external", "vllm"] + with patch.object(sys, "argv", test_args): + # argparse may call sys.exit on error; capture to avoid test aborts + try: + main() + except SystemExit: + pass + mock_init.assert_called_once() # got constructed + mock_run.assert_called_once_with() # run() called + + def test_build_help(self): + test_args = ["cli.run", "build", "--help"] + + with patch.object(sys, "argv", test_args): + stdout = io.StringIO() + stderr = io.StringIO() + + # --help always raises SystemExit(0) + with self.assertRaises(SystemExit) as cm: + with redirect_stdout(stdout), redirect_stderr(stderr): + main() + + self.assertEqual(cm.exception.code, 0) + + output = stdout.getvalue() + self.assertIn("usage", output) + self.assertIn("external", output) + + +if __name__ == "__main__": + unittest.main() diff --git a/.ci/lumen_cli/tests/test_cli_helper.py b/.ci/lumen_cli/tests/test_cli_helper.py new file mode 100644 index 0000000000000..848f22d6be200 --- /dev/null +++ b/.ci/lumen_cli/tests/test_cli_helper.py @@ -0,0 +1,115 @@ +import argparse +import io +import unittest +from contextlib import redirect_stderr +from unittest.mock import patch + +from cli.lib.common.cli_helper import BaseRunner, register_targets, RichHelp, TargetSpec + + +# ---- Dummy runners for unittests---- +class FooRunner(BaseRunner): + """Foo description from docstring.""" + + def run(self) -> None: # replaced by mock + pass + + +class BarRunner(BaseRunner): + def run(self) -> None: # replaced by mock + pass + + +def 
add_foo_args(p: argparse.ArgumentParser) -> None: + p.add_argument("--x", type=int, required=True, help="x value") + + +def common_args(p: argparse.ArgumentParser) -> None: + p.add_argument("--verbose", action="store_true", help="verbose flag") + + +def build_parser(specs: dict[str, TargetSpec]) -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(prog="app", formatter_class=RichHelp) + register_targets( + parser=parser, + target_specs=specs, + common_args=common_args, + ) + return parser + + +def get_subparser( + parser: argparse.ArgumentParser, name: str +) -> argparse.ArgumentParser: + subparsers_action = next( + a + for a in parser._subparsers._group_actions # type: ignore[attr-defined] + if isinstance(a, argparse._SubParsersAction) + ) + return subparsers_action.choices[name] + + +class TestRegisterTargets(unittest.TestCase): + def test_metavar_lists_targets(self): + specs: dict[str, TargetSpec] = { + "foo": {"runner": FooRunner, "add_arguments": add_foo_args}, + "bar": {"runner": BarRunner}, + } + parser = build_parser(specs) + subparsers_action = next( + a + for a in parser._subparsers._group_actions # type: ignore[attr-defined] + if isinstance(a, argparse._SubParsersAction) + ) + self.assertEqual(subparsers_action.metavar, "{foo,bar}") + + def test_add_arguments_and_common_args_present(self): + specs: dict[str, TargetSpec] = { + "foo": {"runner": FooRunner, "add_arguments": add_foo_args}, + } + parser = build_parser(specs) + foo = get_subparser(parser, "foo") + help_text = foo.format_help() + self.assertIn("--x", help_text) + self.assertIn("--verbose", help_text) + + def test_runner_constructed_with_ns_and_run_called(self): + specs: dict[str, TargetSpec] = { + "foo": {"runner": FooRunner, "add_arguments": add_foo_args}, + } + parser = build_parser(specs) + + with ( + patch.object(FooRunner, "__init__", return_value=None) as mock_init, + patch.object(FooRunner, "run", return_value=None) as mock_run, + ): + ns = parser.parse_args(["foo", "--x", "3", "--verbose"]) + ns.func(ns) # set by register_targets + # __init__ received the Namespace + self.assertEqual(mock_init.call_count, 1) + (called_ns,), _ = mock_init.call_args + self.assertIsInstance(called_ns, argparse.Namespace) + # run() called with no args + mock_run.assert_called_once_with() + + def test_runner_docstring_used_as_description_when_missing(self): + specs: dict[str, TargetSpec] = { + "foo": {"runner": FooRunner, "add_arguments": add_foo_args}, + } + parser = build_parser(specs) + foo = get_subparser(parser, "foo") + help_text = foo.format_help() + self.assertIn("Foo description from docstring.", help_text) + + def test_missing_target_raises_systemexit_with_usage(self): + specs: dict[str, TargetSpec] = {"foo": {"runner": FooRunner}} + parser = build_parser(specs) + buf = io.StringIO() + with self.assertRaises(SystemExit), redirect_stderr(buf): + parser.parse_args([]) + err = buf.getvalue() + self.assertIn("usage:", err) + + +if __name__ == "__main__": + unittest.main() diff --git a/.ci/lumen_cli/tests/test_docker_helper.py b/.ci/lumen_cli/tests/test_docker_helper.py new file mode 100644 index 0000000000000..0f15cd4b99bad --- /dev/null +++ b/.ci/lumen_cli/tests/test_docker_helper.py @@ -0,0 +1,75 @@ +import unittest +from unittest import mock +from unittest.mock import MagicMock + +import docker.errors as derr +from cli.lib.common.docker_helper import _get_client, local_image_exists + + +class TestDockerImageHelpers(unittest.TestCase): + def setUp(self): + # Reset the singleton in the target module + patcher = 
mock.patch("cli.lib.common.docker_helper._docker_client", None) + self.addCleanup(patcher.stop) + patcher.start() + + def test_local_image_exists_true(self): + # Mock a docker client whose images.get returns an object (no exception) + mock_client = MagicMock() + mock_client.images.get.return_value = object() + ok = local_image_exists("repo:tag", client=mock_client) + self.assertTrue(ok) + + def test_local_image_exists_not_found_false(self): + mock_client = MagicMock() + # Raise docker.errors.NotFound + mock_client.images.get.side_effect = derr.NotFound("nope") + ok = local_image_exists("missing:latest", client=mock_client) + self.assertFalse(ok) + + def test_local_image_exists_api_error_false(self): + mock_client = MagicMock() + mock_client.images.get.side_effect = derr.APIError("boom", None) + + ok = local_image_exists("broken:tag", client=mock_client) + self.assertFalse(ok) + + def test_local_image_exists_uses_lazy_singleton(self): + # Patch docker.from_env used by _get_client() + with mock.patch( + "cli.lib.common.docker_helper.docker.from_env" + ) as mock_from_env: + mock_docker_client = MagicMock() + mock_from_env.return_value = mock_docker_client + + # First call should create and cache the client + c1 = _get_client() + self.assertIs(c1, mock_docker_client) + mock_from_env.assert_called_once() + + # Second call should reuse cached client (no extra from_env calls) + c2 = _get_client() + self.assertIs(c2, mock_docker_client) + mock_from_env.assert_called_once() # still once + + def test_local_image_exists_without_client_param_calls_get_client_once(self): + # Ensure _get_client is called and cached; local_image_exists should reuse it + with mock.patch("cli.lib.common.docker_helper._get_client") as mock_get_client: + mock_client = MagicMock() + mock_get_client.return_value = mock_client + + # 1st call + local_image_exists("repo:tag") + # 2nd call + local_image_exists("repo:tag2") + + # local_image_exists should call _get_client each time, + # but your _get_client itself caches docker.from_env. 
+ self.assertEqual(mock_get_client.call_count, 2) + self.assertEqual(mock_client.images.get.call_count, 2) + mock_client.images.get.assert_any_call("repo:tag") + mock_client.images.get.assert_any_call("repo:tag2") + + +if __name__ == "__main__": + unittest.main() diff --git a/.ci/lumen_cli/tests/test_envs_helper.py b/.ci/lumen_cli/tests/test_envs_helper.py new file mode 100644 index 0000000000000..187f3016d7ea5 --- /dev/null +++ b/.ci/lumen_cli/tests/test_envs_helper.py @@ -0,0 +1,149 @@ +import os +import unittest +from dataclasses import dataclass +from pathlib import Path +from unittest.mock import patch + +import cli.lib.common.envs_helper as m + + +class TestEnvHelpers(unittest.TestCase): + def setUp(self): + # Keep a copy of the original environment to restore later + self._env_backup = dict(os.environ) + + def tearDown(self): + # Restore environment to original state + os.environ.clear() + os.environ.update(self._env_backup) + + # -------- get_env -------- + def test_get_env_unset_returns_default(self): + with patch.dict(os.environ, {}, clear=True): + self.assertEqual(m.get_env("FOO", "default"), "default") + + def test_get_env_empty_returns_default(self): + with patch.dict(os.environ, {"FOO": ""}, clear=True): + self.assertEqual(m.get_env("FOO", "default"), "default") + + def test_get_env_set_returns_value(self): + with patch.dict(os.environ, {"FOO": "bar"}, clear=True): + self.assertEqual(m.get_env("FOO", "default"), "bar") + + def test_get_env_not_exist_returns_default(self): + with patch.dict(os.environ, {"FOO": "bar"}, clear=True): + self.assertEqual(m.get_env("TEST_NOT_EXIST", "default"), "default") + + def test_get_env_not_exist_without_default(self): + with patch.dict(os.environ, {"FOO": "bar"}, clear=True): + self.assertEqual(m.get_env("TEST_NOT_EXIST"), "") + + # -------- env_bool -------- + def test_env_bool_uses_default_when_unset(self): + with patch.dict(os.environ, {}, clear=True): + self.assertTrue(m.env_bool("FLAG", default=True)) + self.assertFalse(m.env_bool("FLAG", default=False)) + + def test_env_bool_uses_str2bool_when_set(self): + # Patch str2bool used by env_bool so we don't depend on its exact behavior + def fake_str2bool(s: str) -> bool: + return s.lower() in {"1", "true", "yes", "on", "y"} + + with ( + patch.dict(os.environ, {"FLAG": "yEs"}, clear=True), + patch.object(m, "str2bool", fake_str2bool), + ): + self.assertTrue(m.env_bool("FLAG", default=False)) + + # -------- env_path_optional / env_path -------- + def test_env_path_optional_unset_returns_none_by_default(self): + with patch.dict(os.environ, {}, clear=True): + self.assertIsNone(m.env_path_optional("P")) + + def test_env_path_optional_unset_returns_none_when_env_var_is_empty(self): + with patch.dict(os.environ, {"P": ""}, clear=True): + self.assertIsNone(m.env_path_optional("P")) + + def test_env_path_optional_unset_returns_default_str(self): + # default as string; resolve=True by default -> absolute path + default_str = "x/y" + with patch.dict(os.environ, {}, clear=True): + p = m.env_path_optional("P", default=default_str) + self.assertIsInstance(p, Path) + self.assertIsNotNone(p) + if p: + self.assertTrue(p.is_absolute()) + self.assertEqual(p.parts[-2:], ("x", "y")) + + def test_env_path_optional_unset_returns_default_path_no_resolve(self): + d = Path("z") + with patch.dict(os.environ, {}, clear=True): + p = m.env_path_optional("P", default=d, resolve=False) + self.assertEqual(p, d) + + def test_env_path_optional_respects_resolve_true(self): + with patch.dict(os.environ, {"P": "a/b"}, 
clear=True): + p = m.env_path_optional("P", resolve=True) + self.assertIsInstance(p, Path) + if p: + self.assertTrue(p.is_absolute()) + + def test_env_path_optional_respects_resolve_false(self): + with patch.dict(os.environ, {"P": "rel/dir"}, clear=True): + p = m.env_path_optional("P", resolve=False) + self.assertEqual(p, Path("rel/dir")) + if p: + self.assertFalse(p.is_absolute()) + + def test_env_path_raises_when_missing_and_default_none(self): + with patch.dict(os.environ, {}, clear=True): + with self.assertRaises(ValueError): + m.env_path("P", None, resolve=True) + + def test_env_path_returns_path_when_present(self): + tmp = Path("./b").resolve() + with patch.dict(os.environ, {"P": str(tmp)}, clear=True): + p = m.env_path("P", None, resolve=True) + self.assertEqual(p, tmp) + + # -------- dataclass field helpers -------- + def test_dataclass_fields_read_env_at_instantiation(self): + @dataclass + class Cfg: + flag: bool = m.env_bool_field("FLAG", default=False) + out: Path = m.env_path_field("OUT", default="ab", resolve=True) + name: str = m.env_str_field("NAME", default="anon") + + # First instantiation + with patch.dict( + os.environ, {"FLAG": "true", "OUT": "outdir", "NAME": "alice"}, clear=True + ): + cfg1 = Cfg() + self.assertTrue(cfg1.flag) + self.assertIsInstance(cfg1.out, Path) + self.assertTrue(cfg1.out.is_absolute()) + self.assertEqual(cfg1.name, "alice") + cfg1.name = "bob" # change instance value + self.assertEqual(cfg1.name, "bob") # change is reflected + + # Change env; new instance should reflect new values + with patch.dict(os.environ, {"FLAG": "false", "NAME": ""}, clear=True): + cfg2 = Cfg() + self.assertFalse(cfg2.flag) # str2bool("false") -> False + self.assertTrue("ab" in str(cfg2.out)) + self.assertIsInstance(cfg2.out, Path) + self.assertTrue(cfg2.out.is_absolute()) + self.assertEqual(cfg2.name, "anon") # empty -> fallback to default + + def test_dataclass_path_field_with_default_value(self): + @dataclass + class C2: + out: Path = m.env_path_field("OUT", default="some/dir", resolve=False) + + with patch.dict(os.environ, {}, clear=True): + c = C2() + self.assertEqual(c.out, Path("some/dir")) + + +if __name__ == "__main__": + unittest.main() diff --git a/.ci/lumen_cli/tests/test_path_helper.py b/.ci/lumen_cli/tests/test_path_helper.py new file mode 100644 index 0000000000000..d90ffa5631f59 --- /dev/null +++ b/.ci/lumen_cli/tests/test_path_helper.py @@ -0,0 +1,122 @@ +# test_path_utils.py +# Run: pytest -q + +import os +import unittest +from pathlib import Path +from tempfile import TemporaryDirectory + +from cli.lib.common.path_helper import ( + copy, + ensure_dir_exists, + force_create_dir, + get_path, + is_path_exist, + remove_dir, +) + + +class TestPathHelper(unittest.TestCase): + def setUp(self): + self.tmpdir = TemporaryDirectory() + self.tmp_path = Path(self.tmpdir.name) + + def tearDown(self): + self.tmpdir.cleanup() + + # -------- get_path -------- + def test_get_path_returns_path_for_str(self): + # Use relative path to avoid absolute-ness + rel_str = "sub/f.txt" + os.chdir(self.tmp_path) + p = get_path(rel_str, resolve=False) + self.assertIsInstance(p, Path) + self.assertFalse(p.is_absolute()) + self.assertEqual(str(p), rel_str) + + def test_get_path_resolves(self): + rel_str = "sub/f.txt" + p = get_path(str(self.tmp_path / rel_str), resolve=True) + self.assertTrue(p.is_absolute()) + self.assertTrue(str(p).endswith(rel_str)) + + def test_get_path_with_path_input(self): + p_in = self.tmp_path / "sub/f.txt" + p_out = get_path(p_in, resolve=False) + 
self.assertTrue(str(p_out) == str(p_in)) + + def test_get_path_with_none_raises(self): + with self.assertRaises(ValueError): + get_path(None) # type: ignore[arg-type] + + def test_get_path_invalid_type_raises(self): + with self.assertRaises(TypeError): + get_path(123) # type: ignore[arg-type] + + # -------- ensure_dir_exists / force_create_dir / remove_dir -------- + def test_ensure_dir_exists_creates_and_is_idempotent(self): + d = self.tmp_path / "made" + ensure_dir_exists(d) + self.assertTrue(d.exists() and d.is_dir()) + ensure_dir_exists(d) + + def test_force_create_dir_clears_existing(self): + d = self.tmp_path / "fresh" + (d / "inner").mkdir(parents=True) + (d / "inner" / "f.txt").write_text("x") + force_create_dir(d) + self.assertTrue(d.exists()) + self.assertEqual(list(d.iterdir()), []) + + def test_remove_dir_none_is_noop(self): + remove_dir(None) # type: ignore[arg-type] + + def test_remove_dir_nonexistent_is_noop(self): + ghost = self.tmp_path / "ghost" + remove_dir(ghost) + + def test_remove_dir_accepts_str(self): + d = self.tmp_path / "to_rm" + d.mkdir() + remove_dir(str(d)) + self.assertFalse(d.exists()) + + # -------- copy -------- + def test_copy_file_to_file(self): + src = self.tmp_path / "src.txt" + dst = self.tmp_path / "out" / "dst.txt" + src.write_text("hello") + copy(src, dst) + self.assertEqual(dst.read_text(), "hello") + + def test_copy_dir_to_new_dir(self): + src = self.tmp_path / "srcdir" + (src / "a").mkdir(parents=True) + (src / "a" / "f.txt").write_text("content") + dst = self.tmp_path / "destdir" + copy(src, dst) + self.assertEqual((dst / "a" / "f.txt").read_text(), "content") + + def test_copy_dir_into_existing_dir_overwrite_true_merges(self): + src = self.tmp_path / "srcdir" + dst = self.tmp_path / "destdir" + (src / "x").mkdir(parents=True) + (src / "x" / "new.txt").write_text("new") + dst.mkdir() + (dst / "existing.txt").write_text("old") + copy(src, dst) + self.assertEqual((dst / "existing.txt").read_text(), "old") + self.assertEqual((dst / "x" / "new.txt").read_text(), "new") + + def test_is_str_path_exist(self): + p = self.tmp_path / "x.txt" + p.write_text("1") + self.assertTrue(is_path_exist(str(p))) + self.assertTrue(is_path_exist(p)) + self.assertFalse(is_path_exist(str(self.tmp_path / "missing"))) + self.assertFalse(is_path_exist(self.tmp_path / "missing")) + self.assertFalse(is_path_exist("")) + + +if __name__ == "__main__": + unittest.main() diff --git a/.ci/lumen_cli/tests/test_run_plan.py b/.ci/lumen_cli/tests/test_run_plan.py new file mode 100644 index 0000000000000..a85ed2e3986f6 --- /dev/null +++ b/.ci/lumen_cli/tests/test_run_plan.py @@ -0,0 +1,185 @@ +# tests/test_run_test_plan.py +import importlib +from contextlib import nullcontext +from types import SimpleNamespace +from unittest.mock import MagicMock + +import pytest + + +MOD = "cli.lib.core.vllm.lib" + +# We import inside tests so the MOD override above applies everywhere +run_test_plan_import_path = f"{MOD}.run_test_plan" + + +def _get_cmd(c): + # Support both kwargs and positional args + return c.kwargs.get("cmd", c.args[0] if c.args else None) + + +def _get_check(c): + if "check" in c.kwargs: + return c.kwargs["check"] + # If positional, assume second arg is 'check' when present; default False + return c.args[1] if len(c.args) > 1 else False + + +@pytest.fixture +def patch_module(monkeypatch): + """ + Patch helpers ('pip_install_packages', 'temp_environ', 'working_directory', + 'run_command', 'logger') inside the target module and expose them. 
+ """ + module = importlib.import_module(MOD) + + # Create fakes/mocks + pip_install_packages = MagicMock(name="pip_install_packages") + run_command = MagicMock(name="run_command", return_value=0) + + # temp_environ / working_directory: record calls but act as context managers + temp_calls: list[dict] = [] + workdir_calls: list[str] = [] + + def fake_working_directory(path: str): + workdir_calls.append(path) + return nullcontext() + + def fake_temp_env(map: dict[str, str]): + temp_calls.append(map) + return nullcontext() + + logger = SimpleNamespace( + info=MagicMock(name="logger.info"), + error=MagicMock(name="logger.error"), + ) + + # Apply patches (raise if attribute doesn't exist) + monkeypatch.setattr( + module, "pip_install_packages", pip_install_packages, raising=True + ) + monkeypatch.setattr(module, "run_command", run_command, raising=True) + monkeypatch.setattr( + module, "working_directory", fake_working_directory, raising=True + ) + monkeypatch.setattr(module, "temp_environ", fake_temp_env, raising=True) + monkeypatch.setattr(module, "logger", logger, raising=True) + + return SimpleNamespace( + module=module, + run_test_plan=module.run_test_plan, # expose to avoid getattr("constant") (Ruff B009) + pip_install_packages=pip_install_packages, + run_command=run_command, + temp_calls=temp_calls, + workdir_calls=workdir_calls, + logger=logger, + ) + + +def test_success_runs_all_steps_and_uses_env_and_workdir(monkeypatch, patch_module): + run_test_plan = patch_module.run_test_plan + + tests_map = { + "basic": { + "title": "Basic suite", + "package_install": [], + "working_directory": "tests", + "env_vars": {"GLOBAL_FLAG": "1"}, + "steps": [ + "export A=x && pytest -q", + "export B=y && pytest -q tests/unit", + ], + } + } + + # One exit code per step (export + two pytest) + patch_module.run_command.side_effect = [0, 0, 0] + + run_test_plan("basic", "cpu", tests_map) + + calls = patch_module.run_command.call_args_list + cmds = [_get_cmd(c) for c in calls] + checks = [_get_check(c) for c in calls] + + assert cmds == [ + "export A=x && pytest -q", + "export B=y && pytest -q tests/unit", + ] + assert all(chk is False for chk in checks) + + assert patch_module.workdir_calls == ["tests"] + assert patch_module.temp_calls == [{"GLOBAL_FLAG": "1"}] + + +def test_installs_packages_when_present(monkeypatch, patch_module): + run_test_plan = patch_module.module.run_test_plan + + tests_map = { + "with_pkgs": { + "title": "Needs deps", + "package_install": ["timm==1.0.0", "flash-attn"], + "steps": ["pytest -q"], + } + } + + patch_module.run_command.return_value = 0 + + run_test_plan("with_pkgs", "gpu", tests_map) + + patch_module.pip_install_packages.assert_called_once_with( + packages=["timm==1.0.0", "flash-attn"], + prefer_uv=True, + ) + + +def test_raises_on_missing_plan(patch_module): + run_test_plan = patch_module.module.run_test_plan + with pytest.raises(RuntimeError) as ei: + run_test_plan("nope", "cpu", tests_map={}) + + assert "test nope not found" in str(ei.value) + + +def test_aggregates_failures_and_raises(monkeypatch, patch_module): + run_test_plan = patch_module.module.run_test_plan + + tests_map = { + "mix": { + "title": "Some pass some fail", + "steps": [ + "pytest test_a.py", # 0 → pass + "pytest test_b.py", # 1 → fail + "pytest test_c.py", # 2 → fail + ], + } + } + + # Simulate pass, fail, fail + patch_module.run_command.side_effect = [0, 1, 2] + + with pytest.raises(RuntimeError) as ei: + run_test_plan("mix", "cpu", tests_map) + + msg = str(ei.value) + assert "2 pytest runs failed" 
in msg + # Ensure logger captured failed tests list + patch_module.logger.error.assert_called_once() + # And we attempted all three commands + assert patch_module.run_command.call_count == 3 + + +def test_custom_working_directory_used(patch_module): + run_test_plan = patch_module.module.run_test_plan + + tests_map = { + "customwd": { + "title": "Custom wd", + "working_directory": "examples/ci", + "steps": ["pytest -q"], + } + } + + patch_module.run_command.return_value = 0 + run_test_plan("customwd", "cpu", tests_map) + + assert patch_module.workdir_calls == ["examples/ci"] diff --git a/.ci/lumen_cli/tests/test_utils.py b/.ci/lumen_cli/tests/test_utils.py new file mode 100644 index 0000000000000..45ae5ad6d407b --- /dev/null +++ b/.ci/lumen_cli/tests/test_utils.py @@ -0,0 +1,143 @@ +import os +import tempfile +import unittest +from pathlib import Path + +from cli.lib.common.utils import temp_environ, working_directory # <-- replace import + + +class EnvIsolatedTestCase(unittest.TestCase): + """Base class that snapshots os.environ and CWD for isolation.""" + + def setUp(self): + import os + import tempfile + + self._env_backup = dict(os.environ) + + # Snapshot/repair CWD if it's gone + try: + self._cwd_backup = os.getcwd() + except FileNotFoundError: + # If CWD no longer exists, switch to a safe place and record that + self._cwd_backup = tempfile.gettempdir() + os.chdir(self._cwd_backup) + + # Create a temporary directory for the test to run in + self._temp_dir = tempfile.mkdtemp() + os.chdir(self._temp_dir) + + def tearDown(self): + import os + import shutil + import tempfile + + # Restore cwd first (before cleaning up temp dir) + try: + os.chdir(self._cwd_backup) + except OSError: + os.chdir(tempfile.gettempdir()) + + # Clean up temporary directory + try: + shutil.rmtree(self._temp_dir, ignore_errors=True) + except Exception: + pass # Ignore cleanup errors + + # Restore env + to_del = set(os.environ.keys()) - set(self._env_backup.keys()) + for k in to_del: + os.environ.pop(k, None) + for k, v in self._env_backup.items(): + os.environ[k] = v + + +class TestTempEnviron(EnvIsolatedTestCase): + def test_sets_and_restores_new_var(self): + var = "TEST_TMP_ENV_NEW" + self.assertNotIn(var, os.environ) + + with temp_environ({var: "123"}): + self.assertEqual(os.environ[var], "123") + + self.assertNotIn(var, os.environ) # removed after exit + + def test_overwrites_and_restores_existing_var(self): + var = "TEST_TMP_ENV_OVERWRITE" + os.environ[var] = "orig" + + with temp_environ({var: "override"}): + self.assertEqual(os.environ[var], "override") + + self.assertEqual(os.environ[var], "orig") # restored + + def test_multiple_vars_and_missing_cleanup(self): + v1, v2 = "TEST_ENV_V1", "TEST_ENV_V2" + os.environ.pop(v1, None) + os.environ[v2] = "keep" + + with temp_environ({v1: "a", v2: "b"}): + self.assertEqual(os.environ[v1], "a") + self.assertEqual(os.environ[v2], "b") + + self.assertNotIn(v1, os.environ) # newly-added -> removed + self.assertEqual(os.environ[v2], "keep") # pre-existing -> restored + + def test_restores_even_on_exception(self): + var = "TEST_TMP_ENV_EXCEPTION" + self.assertNotIn(var, os.environ) + + with self.assertRaises(RuntimeError): + with temp_environ({var: "x"}): + self.assertEqual(os.environ[var], "x") + raise RuntimeError("boom") + + self.assertNotIn(var, os.environ) # removed after exception + + +class TestWorkingDirectory(EnvIsolatedTestCase): + def test_changes_and_restores(self): + start = Path.cwd() + with tempfile.TemporaryDirectory() as td: + target = Path(td) / "wd" + 
target.mkdir() + + with working_directory(str(target)): + self.assertEqual(Path.cwd().resolve(), target.resolve()) + + self.assertEqual(Path.cwd(), start) + + def test_noop_when_empty_path(self): + start = Path.cwd() + with working_directory(""): + self.assertEqual(Path.cwd(), start) + self.assertEqual(Path.cwd(), start) + + def test_restores_on_exception(self): + start = Path.cwd() + + with tempfile.TemporaryDirectory() as td: + target = Path(td) / "wd_exc" + target.mkdir() + + with self.assertRaises(ValueError): + with working_directory(str(target)): + # Normalize both sides to handle /var -> /private/var + self.assertEqual(Path.cwd().resolve(), target.resolve()) + raise ValueError("boom") + + self.assertEqual(Path.cwd().resolve(), start.resolve()) + + def test_raises_for_missing_dir(self): + start = Path.cwd() + with tempfile.TemporaryDirectory() as td: + missing = Path(td) / "does_not_exist" + with self.assertRaises(FileNotFoundError): + # os.chdir should raise before yielding + with working_directory(str(missing)): + pass + self.assertEqual(Path.cwd(), start) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/.ci/lumen_cli/tests/test_vllm.py b/.ci/lumen_cli/tests/test_vllm.py new file mode 100644 index 0000000000000..849eb0c40ee37 --- /dev/null +++ b/.ci/lumen_cli/tests/test_vllm.py @@ -0,0 +1,176 @@ +import os +import tempfile +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch + +import cli.lib.core.vllm.vllm_build as vllm_build + + +_VLLM_BUILD_MODULE = "cli.lib.core.vllm.vllm_build" + + +class TestVllmBuildParameters(unittest.TestCase): + @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=True) + @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=True) + @patch( + "cli.lib.common.envs_helper.env_path_optional", + side_effect=lambda name, default=None, resolve=True: { + "DOCKERFILE_PATH": Path("/abs/vllm/Dockerfile"), + "TORCH_WHEELS_PATH": Path("/abs/dist"), + "OUTPUT_DIR": Path("/abs/shared"), + }.get(name, Path(default) if default is not None else None), + ) + @patch.dict( + os.environ, + { + "USE_TORCH_WHEEL": "1", + "USE_LOCAL_BASE_IMAGE": "1", + "USE_LOCAL_DOCKERFILE": "1", + "BASE_IMAGE": "my/image:tag", + "DOCKERFILE_PATH": "vllm/Dockerfile", + "TORCH_WHEELS_PATH": "dist", + "OUTPUT_DIR": "shared", + }, + clear=True, + ) + def test_params_success_normalizes_and_validates( + self, mock_env_path, mock_is_path, mock_local_img + ): + params = vllm_build.VllmBuildParameters() + self.assertEqual(params.torch_whls_path, Path("/abs/dist")) + self.assertEqual(params.dockerfile_path, Path("/abs/vllm/Dockerfile")) + self.assertEqual(params.output_dir, Path("/abs/shared")) + self.assertEqual(params.base_image, "my/image:tag") + + @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False) + @patch.dict( + os.environ, {"USE_TORCH_WHEEL": "1", "TORCH_WHEELS_PATH": "dist"}, clear=True + ) + def test_params_missing_torch_whls_raises(self, _is_path): + with tempfile.TemporaryDirectory() as td: + os.chdir(td) + with self.assertRaises(ValueError) as cm: + vllm_build.VllmBuildParameters( + use_local_base_image=False, + use_local_dockerfile=False, + ) + err = cm.exception + self.assertIn("TORCH_WHEELS_PATH", str(err)) + + @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=False) + @patch.dict( + os.environ, {"USE_LOCAL_BASE_IMAGE": "1", "BASE_IMAGE": "img:tag"}, clear=True + ) + def test_params_missing_local_base_image_raises(self, _local_img): + with tempfile.TemporaryDirectory() as td: + 
os.chdir(td) + with self.assertRaises(ValueError) as cm: + vllm_build.VllmBuildParameters( + use_torch_whl=False, + use_local_dockerfile=False, + ) + err = cm.exception + self.assertIn("BASE_IMAGE", str(err)) + + @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False) + @patch.dict( + os.environ, + {"USE_LOCAL_DOCKERFILE": "1", "DOCKERFILE_PATH": "Dockerfile"}, + clear=True, + ) + def test_params_missing_dockerfile_raises(self, _is_path): + with tempfile.TemporaryDirectory() as td: + os.chdir(td) + with self.assertRaises(ValueError) as cm: + vllm_build.VllmBuildParameters( + use_torch_whl=False, + use_local_base_image=False, + ) + err = cm.exception + self.assertIn("DOCKERFILE_PATH", str(err)) + + @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False) + @patch.dict( + os.environ, + {"OUTPUT_DIR": ""}, + clear=True, + ) + def test_params_missing_output_dir(self, _is_path): + with self.assertRaises(FileNotFoundError): + vllm_build.VllmBuildParameters() + + +class TestBuildCmdAndRun(unittest.TestCase): + @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=True) + def test_generate_docker_build_cmd_includes_bits(self, _exists): + runner = vllm_build.VllmBuildRunner() + inputs = MagicMock() + inputs.output_dir = Path("/abs/out") + inputs.use_local_base_image = True + inputs.base_image = "img:tag" + inputs.torch_whls_path = Path("./vllm/tmp") + inputs.max_jobs = 64 + inputs.cuda_version = "12.8.1" + inputs.python_version = "3.12" + inputs.sccache_bucket = "my-bucket" + inputs.sccache_region = "us-west-2" + inputs.torch_cuda_arch_list = "8.0;9.0" + inputs.target_stage = "export-wheels" + inputs.tag_name = "vllm-wheels" + + cmd = runner._generate_docker_build_cmd(inputs) + squashed = " ".join(cmd.split()) + + self.assertIn("--output type=local,dest=/abs/out", squashed) + self.assertIn("-f docker/Dockerfile.nightly_torch", squashed) + self.assertIn("--pull=false", squashed) + self.assertIn("--build-arg TORCH_WHEELS_PATH=tmp", squashed) + self.assertIn("--build-arg BUILD_BASE_IMAGE=img:tag", squashed) + self.assertIn("--build-arg FINAL_BASE_IMAGE=img:tag", squashed) + self.assertIn("--build-arg max_jobs=64", squashed) + self.assertIn("--build-arg CUDA_VERSION=12.8.1", squashed) + self.assertIn("--build-arg PYTHON_VERSION=3.12", squashed) + self.assertIn("--build-arg USE_SCCACHE=1", squashed) + self.assertIn("--build-arg SCCACHE_BUCKET_NAME=my-bucket", squashed) + self.assertIn("--build-arg SCCACHE_REGION_NAME=us-west-2", squashed) + self.assertIn("--build-arg torch_cuda_arch_list='8.0;9.0'", squashed) + self.assertIn("--target export-wheels", squashed) + self.assertIn("-t vllm-wheels", squashed) + + @patch(f"{_VLLM_BUILD_MODULE}.run_command") + @patch(f"{_VLLM_BUILD_MODULE}.ensure_dir_exists") + @patch(f"{_VLLM_BUILD_MODULE}.clone_vllm") + @patch.object( + vllm_build.VllmBuildRunner, + "_generate_docker_build_cmd", + return_value="docker buildx ...", + ) + @patch.dict( + os.environ, + { + "USE_TORCH_WHEEL": "0", + "USE_LOCAL_BASE_IMAGE": "0", + "USE_LOCAL_DOCKERFILE": "0", + "OUTPUT_DIR": "shared", + }, + clear=True, + ) + def test_run_calls_clone_prepare_and_build( + self, mock_gen, mock_clone, mock_ensure, mock_run + ): + params = MagicMock() + params.output_dir = Path("shared") + params.use_local_dockerfile = False + params.use_torch_whl = False + + with patch(f"{_VLLM_BUILD_MODULE}.VllmBuildParameters", return_value=params): + runner = vllm_build.VllmBuildRunner() + runner.run() + + mock_clone.assert_called_once() + 
mock_ensure.assert_called_once_with(Path("shared")) + mock_gen.assert_called_once_with(params) + mock_run.assert_called_once() + _, kwargs = mock_run.call_args + assert kwargs.get("cwd") == "vllm" diff --git a/.ci/magma/Makefile b/.ci/magma/Makefile index 5035e1ee3b2c6..4169aedd03fa5 100644 --- a/.ci/magma/Makefile +++ b/.ci/magma/Makefile @@ -16,6 +16,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ magma/build_magma.sh .PHONY: all +all: magma-cuda130 all: magma-cuda129 all: magma-cuda128 all: magma-cuda126 @@ -25,6 +26,12 @@ clean: $(RM) -r magma-* $(RM) -r output +.PHONY: magma-cuda130 +magma-cuda130: DESIRED_CUDA := 13.0 +magma-cuda130: CUDA_ARCH_LIST := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 +magma-cuda130: + $(DOCKER_RUN) + .PHONY: magma-cuda129 magma-cuda129: DESIRED_CUDA := 12.9 magma-cuda129: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 diff --git a/.ci/magma/build_magma.sh b/.ci/magma/build_magma.sh index 3ac0bcaf1d5ba..6f1924fa45965 100755 --- a/.ci/magma/build_magma.sh +++ b/.ci/magma/build_magma.sh @@ -28,6 +28,7 @@ pushd ${PACKAGE_DIR}/magma-${MAGMA_VERSION} patch < ${PACKAGE_FILES}/CMake.patch patch < ${PACKAGE_FILES}/cmakelists.patch patch -p0 < ${PACKAGE_FILES}/thread_queue.patch +patch -p1 < ${PACKAGE_FILES}/cuda13.patch patch -p1 < ${PACKAGE_FILES}/getrf_shfl.patch patch -p1 < ${PACKAGE_FILES}/getrf_nbparam.patch # The build.sh script expects to be executed from the sources root folder @@ -37,6 +38,7 @@ popd # Package recipe, license and tarball # Folder and package name are backward compatible for the build workflow cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh +cp ${PACKAGE_FILES}/cuda13.patch ${PACKAGE_RECIPE}/cuda13.patch cp ${PACKAGE_FILES}/thread_queue.patch ${PACKAGE_RECIPE}/thread_queue.patch cp ${PACKAGE_FILES}/cmakelists.patch ${PACKAGE_RECIPE}/cmakelists.patch cp ${PACKAGE_FILES}/getrf_shfl.patch ${PACKAGE_RECIPE}/getrf_shfl.patch diff --git a/.ci/magma/package_files/cuda13.patch b/.ci/magma/package_files/cuda13.patch new file mode 100644 index 0000000000000..d6ebaf9dfaae7 --- /dev/null +++ b/.ci/magma/package_files/cuda13.patch @@ -0,0 +1,26 @@ +diff --git a/interface_cuda/interface.cpp b/interface_cuda/interface.cpp +index 73fed1b20..e77519bfe 100644 +--- a/interface_cuda/interface.cpp ++++ b/interface_cuda/interface.cpp +@@ -438,14 +438,20 @@ magma_print_environment() + cudaDeviceProp prop; + err = cudaGetDeviceProperties( &prop, dev ); + check_error( err ); ++ #ifdef MAGMA_HAVE_CUDA ++#if CUDA_VERSION < 13000 + printf( "%% device %d: %s, %.1f MHz clock, %.1f MiB memory, capability %d.%d\n", + dev, + prop.name, + prop.clockRate / 1000., ++#else ++ printf( "%% device %d: %s, ??? 
MHz clock, %.1f MiB memory, capability %d.%d\n", ++ dev, ++ prop.name, ++#endif + prop.totalGlobalMem / (1024.*1024.), + prop.major, + prop.minor ); +- #ifdef MAGMA_HAVE_CUDA + int arch = prop.major*100 + prop.minor*10; + if ( arch < MAGMA_CUDA_ARCH_MIN ) { + printf("\n" diff --git a/.ci/manywheel/build.sh b/.ci/manywheel/build.sh index 4c4d51134715a..6b2a60bc5ca28 100755 --- a/.ci/manywheel/build.sh +++ b/.ci/manywheel/build.sh @@ -5,10 +5,6 @@ set -ex SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" case "${GPU_ARCH_TYPE:-BLANK}" in - BLANK) - # Legacy behavior for CircleCI - bash "${SCRIPTPATH}/build_cuda.sh" - ;; cuda) bash "${SCRIPTPATH}/build_cuda.sh" ;; diff --git a/.ci/manywheel/build_common.sh b/.ci/manywheel/build_common.sh index 49549c9f2994e..4c268befb30e5 100644 --- a/.ci/manywheel/build_common.sh +++ b/.ci/manywheel/build_common.sh @@ -138,28 +138,11 @@ fi echo "Calling setup.py bdist at $(date)" -if [[ "$USE_SPLIT_BUILD" == "true" ]]; then - echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)" - time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ - BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 \ +time CMAKE_ARGS=${CMAKE_ARGS[@]} \ + EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR - echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)" - echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" - time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ - BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \ - BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ - USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ - CMAKE_FRESH=1 python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR - echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" -else - time CMAKE_ARGS=${CMAKE_ARGS[@]} \ - EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ - BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ - USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ - python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR -fi echo "Finished setup.py bdist at $(date)" # Build libtorch packages @@ -272,10 +255,6 @@ ls /tmp/$WHEELHOUSE_DIR mkdir -p "/$WHEELHOUSE_DIR" mv /tmp/$WHEELHOUSE_DIR/torch*linux*.whl /$WHEELHOUSE_DIR/ -if [[ "$USE_SPLIT_BUILD" == "true" ]]; then - mv /tmp/$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/ || true -fi - if [[ -n "$BUILD_PYTHONLESS" ]]; then mkdir -p /$LIBTORCH_HOUSE_DIR mv /tmp/$LIBTORCH_HOUSE_DIR/*.zip /$LIBTORCH_HOUSE_DIR @@ -452,16 +431,8 @@ if [[ -z "$BUILD_PYTHONLESS" ]]; then pushd $PYTORCH_ROOT/test # Install the wheel for this Python version - if [[ "$USE_SPLIT_BUILD" == "true" ]]; then - pip uninstall -y "$TORCH_NO_PYTHON_PACKAGE_NAME" || true - fi - pip uninstall -y "$TORCH_PACKAGE_NAME" - if [[ "$USE_SPLIT_BUILD" == "true" ]]; then - pip install "$TORCH_NO_PYTHON_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v - fi - pip install "$TORCH_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v # Print info on the libraries installed in this wheel diff --git a/.ci/manywheel/build_cuda.sh b/.ci/manywheel/build_cuda.sh index 39586faa85f87..6ed38f8b25c62 100644 --- a/.ci/manywheel/build_cuda.sh +++ b/.ci/manywheel/build_cuda.sh @@ -66,6 +66,9 @@ case ${CUDA_VERSION} in TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX" fi 
;; + 13.0) + TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX" + ;; 12.6) TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0" ;; @@ -110,13 +113,18 @@ DEPS_SONAME=( ) -# CUDA_VERSION 12.6, 12.8, 12.9 -if [[ $CUDA_VERSION == 12* ]]; then +# CUDA_VERSION 12.*, 13.* +if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then export USE_STATIC_CUDNN=0 # Try parallelizing nvcc as well - export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2" + TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2" + # Compress the fatbin with -compress-mode=size for CUDA 13 + if [[ $CUDA_VERSION == 13* ]]; then + export TORCH_NVCC_FLAGS="$TORCH_NVCC_FLAGS -compress-mode=size" + fi if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then echo "Bundling with cudnn and cublas." + DEPS_LIST+=( "/usr/local/cuda/lib64/libcudnn_adv.so.9" "/usr/local/cuda/lib64/libcudnn_cnn.so.9" @@ -126,15 +134,11 @@ if [[ $CUDA_VERSION == 12* ]]; then "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9" "/usr/local/cuda/lib64/libcudnn_heuristic.so.9" "/usr/local/cuda/lib64/libcudnn.so.9" - "/usr/local/cuda/lib64/libcublas.so.12" - "/usr/local/cuda/lib64/libcublasLt.so.12" "/usr/local/cuda/lib64/libcusparseLt.so.0" - "/usr/local/cuda/lib64/libcudart.so.12" - "/usr/local/cuda/lib64/libnvrtc.so.12" "/usr/local/cuda/lib64/libnvrtc-builtins.so" "/usr/local/cuda/lib64/libcufile.so.0" "/usr/local/cuda/lib64/libcufile_rdma.so.1" - "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12" + "/usr/local/cuda/lib64/libnvshmem_host.so.3" "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so" ) DEPS_SONAME+=( @@ -146,41 +150,83 @@ if [[ $CUDA_VERSION == 12* ]]; then "libcudnn_engines_precompiled.so.9" "libcudnn_heuristic.so.9" "libcudnn.so.9" - "libcublas.so.12" - "libcublasLt.so.12" "libcusparseLt.so.0" - "libcudart.so.12" - "libnvrtc.so.12" "libnvrtc-builtins.so" + "libnvshmem_host.so.3" "libcufile.so.0" "libcufile_rdma.so.1" - "libcupti.so.12" "libnvperf_host.so" ) # Add libnvToolsExt only if CUDA version is not 12.9 - if [[ $CUDA_VERSION != 12.9* ]]; then - DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1") - DEPS_SONAME+=("libnvToolsExt.so.1") + if [[ $CUDA_VERSION == 13* ]]; then + DEPS_LIST+=( + "/usr/local/cuda/lib64/libcublas.so.13" + "/usr/local/cuda/lib64/libcublasLt.so.13" + "/usr/local/cuda/lib64/libcudart.so.13" + "/usr/local/cuda/lib64/libnvrtc.so.13" + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13" + "/usr/local/cuda/lib64/libibverbs.so.1" + "/usr/local/cuda/lib64/librdmacm.so.1" + "/usr/local/cuda/lib64/libmlx5.so.1" + "/usr/local/cuda/lib64/libnl-3.so.200" + "/usr/local/cuda/lib64/libnl-route-3.so.200") + DEPS_SONAME+=( + "libcublas.so.13" + "libcublasLt.so.13" + "libcudart.so.13" + "libnvrtc.so.13" + "libcupti.so.13" + "libibverbs.so.1" + "librdmacm.so.1" + "libmlx5.so.1" + "libnl-3.so.200" + "libnl-route-3.so.200") + export USE_CUPTI_SO=1 + export ATEN_STATIC_CUDA=0 + export USE_CUDA_STATIC_LINK=0 + export USE_CUFILE=0 + else + DEPS_LIST+=( + "/usr/local/cuda/lib64/libnvToolsExt.so.1" + "/usr/local/cuda/lib64/libcublas.so.12" + "/usr/local/cuda/lib64/libcublasLt.so.12" + "/usr/local/cuda/lib64/libcudart.so.12" + "/usr/local/cuda/lib64/libnvrtc.so.12" + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12") + DEPS_SONAME+=( + "libnvToolsExt.so.1" + "libcublas.so.12" + "libcublasLt.so.12" + "libcudart.so.12" + "libnvrtc.so.12" + "libcupti.so.12") fi else echo "Using nvidia libs from pypi." 
CUDA_RPATHS=( - '$ORIGIN/../../nvidia/cublas/lib' - '$ORIGIN/../../nvidia/cuda_cupti/lib' - '$ORIGIN/../../nvidia/cuda_nvrtc/lib' - '$ORIGIN/../../nvidia/cuda_runtime/lib' '$ORIGIN/../../nvidia/cudnn/lib' - '$ORIGIN/../../nvidia/cufft/lib' - '$ORIGIN/../../nvidia/curand/lib' - '$ORIGIN/../../nvidia/cusolver/lib' - '$ORIGIN/../../nvidia/cusparse/lib' - '$ORIGIN/../../nvidia/cusparselt/lib' - '$ORIGIN/../../cusparselt/lib' - '$ORIGIN/../../nvidia/nccl/lib' '$ORIGIN/../../nvidia/nvshmem/lib' - '$ORIGIN/../../nvidia/nvtx/lib' - '$ORIGIN/../../nvidia/cufile/lib' + '$ORIGIN/../../nvidia/nccl/lib' + '$ORIGIN/../../nvidia/cusparselt/lib' ) + if [[ $CUDA_VERSION == 13* ]]; then + CUDA_RPATHS+=('$ORIGIN/../../nvidia/cu13/lib') + else + CUDA_RPATHS+=( + '$ORIGIN/../../nvidia/cublas/lib' + '$ORIGIN/../../nvidia/cuda_cupti/lib' + '$ORIGIN/../../nvidia/cuda_nvrtc/lib' + '$ORIGIN/../../nvidia/cuda_runtime/lib' + '$ORIGIN/../../nvidia/cufft/lib' + '$ORIGIN/../../nvidia/curand/lib' + '$ORIGIN/../../nvidia/cusolver/lib' + '$ORIGIN/../../nvidia/cusparse/lib' + '$ORIGIN/../../cusparselt/lib' + '$ORIGIN/../../nvidia/nvtx/lib' + '$ORIGIN/../../nvidia/cufile/lib' + ) + fi + CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}") export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib' export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN' diff --git a/.ci/manywheel/build_xpu.sh b/.ci/manywheel/build_xpu.sh index ff157b1c0b205..bd7b168be336c 100755 --- a/.ci/manywheel/build_xpu.sh +++ b/.ci/manywheel/build_xpu.sh @@ -25,6 +25,7 @@ source /opt/intel/oneapi/mpi/latest/env/vars.sh export USE_STATIC_MKL=1 export USE_ONEMKL=1 export USE_XCCL=1 +export USE_MPI=0 WHEELHOUSE_DIR="wheelhousexpu" LIBTORCH_HOUSE_DIR="libtorch_housexpu" diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh index a7ce0fef736cf..1c88554c2af96 100755 --- a/.ci/pytorch/build.sh +++ b/.ci/pytorch/build.sh @@ -50,9 +50,6 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then export ATEN_THREADING=NATIVE fi -# Enable LLVM dependency for TensorExpr testing -export USE_LLVM=/opt/llvm -export LLVM_DIR=/opt/llvm/lib/cmake/llvm if ! 
which conda; then # In ROCm CIs, we are doing cross compilation on build machines with @@ -95,6 +92,27 @@ if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then export ACL_ROOT_DIR=/ComputeLibrary fi +if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then + if [[ -f /opt/riscv-cross-env/bin/activate ]]; then + # shellcheck disable=SC1091 + source /opt/riscv-cross-env/bin/activate + else + echo "Activation file not found" + exit 1 + fi + + export CMAKE_CROSSCOMPILING=TRUE + export CMAKE_SYSTEM_NAME=Linux + export CMAKE_SYSTEM_PROCESSOR=riscv64 + + export USE_CUDA=0 + export USE_MKLDNN=0 + + export SLEEF_TARGET_EXEC_USE_QEMU=ON + sudo chown -R jenkins /var/lib/jenkins/workspace /opt + +fi + if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then POSSIBLE_JAVA_HOMES=() POSSIBLE_JAVA_HOMES+=(/usr/local) @@ -155,6 +173,7 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then source /opt/intel/oneapi/mpi/latest/env/vars.sh # Enable XCCL build export USE_XCCL=1 + export USE_MPI=0 # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA export USE_KINETO=0 export TORCH_XPU_ARCH_LIST=pvc @@ -176,8 +195,16 @@ fi # We only build FlashAttention files for CUDA 8.0+, and they require large amounts of # memory to build and will OOM -if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then - export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j 2" + +if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1; then + J=2 # default to 2 jobs + case "$RUNNER" in + linux.12xlarge.memory|linux.24xlarge.memory) + J=24 + ;; + esac + echo "Building FlashAttention with job limit $J" + export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j ${J}" fi if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then @@ -192,7 +219,6 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then export USE_ASAN=1 export REL_WITH_DEB_INFO=1 export UBSAN_FLAGS="-fno-sanitize-recover=all" - unset USE_LLVM fi if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then @@ -213,7 +239,7 @@ fi # Do not change workspace permissions for ROCm and s390x CI jobs # as it can leave workspace with bad permissions for cancelled jobs -if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then +if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && -d /var/lib/jenkins/workspace ]]; then # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96) WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace") cleanup_workspace() { @@ -258,29 +284,19 @@ else # XLA test build fails when WERROR=1 # set only when building other architectures # or building non-XLA tests. 
- if [[ "$BUILD_ENVIRONMENT" != *rocm* && - "$BUILD_ENVIRONMENT" != *xla* ]]; then + if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *xla* && "$BUILD_ENVIRONMENT" != *riscv64* ]]; then # Install numpy-2.0.2 for builds which are backward compatible with 1.X python -mpip install numpy==2.0.2 WERROR=1 python setup.py clean - if [[ "$USE_SPLIT_BUILD" == "true" ]]; then - python3 tools/packaging/split_wheel.py bdist_wheel - else - WERROR=1 python setup.py bdist_wheel - fi + WERROR=1 python setup.py bdist_wheel else python setup.py clean if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then source .ci/pytorch/install_cache_xla.sh fi - if [[ "$USE_SPLIT_BUILD" == "true" ]]; then - echo "USE_SPLIT_BUILD cannot be used with xla or rocm" - exit 1 - else - python setup.py bdist_wheel - fi + python setup.py bdist_wheel fi pip_install_whl "$(echo dist/*.whl)" @@ -405,7 +421,7 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; # don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build python tools/stats/export_test_times.py fi -# don't do this for bazel or s390x as they don't use sccache -if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then +# don't do this for bazel or s390x or riscv64 as they don't use sccache +if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then print_sccache_stats fi diff --git a/.ci/pytorch/check_binary.sh b/.ci/pytorch/check_binary.sh index 78baf6a0761d7..0f632f8006c07 100755 --- a/.ci/pytorch/check_binary.sh +++ b/.ci/pytorch/check_binary.sh @@ -300,24 +300,3 @@ except RuntimeError as e: exit 1 fi fi - -############################################################################### -# Check for C++ ABI compatibility to GCC-11 - GCC 13 -############################################################################### -if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" == 'manywheel' ]]; then - pushd /tmp - # Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html - # gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19 - # gcc 11 - CUDA 11.8, xpu, rocm - # gcc 13 - CUDA 12.6, 12.8 and cpu - # Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426 - if [[ "$(uname -m)" == "s390x" ]]; then - cxx_abi="19" - elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then - cxx_abi="18" - else - cxx_abi="16" - fi - python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)" - popd -fi diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index 4771544b8b9b1..edfff60744919 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -149,6 +149,19 @@ function get_pinned_commit() { cat .github/ci_commit_pins/"${1}".txt } +function detect_cuda_arch() { + if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then + if command -v nvidia-smi; then + TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1) + elif [[ "${TEST_CONFIG}" == *nogpu* ]]; then + # There won't be nvidia-smi in nogpu tests, so just set TORCH_CUDA_ARCH_LIST to the default + # minimum supported value here + TORCH_CUDA_ARCH_LIST=8.0 + fi + export TORCH_CUDA_ARCH_LIST + fi +} + function install_torchaudio() { local commit commit=$(get_pinned_commit audio) @@ -229,7 +242,6 @@ function install_torchrec_and_fbgemm() { pip_install tabulate # needed for newer fbgemm pip_install patchelf # needed for rocm fbgemm - pushd 
/tmp local wheel_dir=dist/fbgemm_gpu local found_whl=0 @@ -264,7 +276,6 @@ function install_torchrec_and_fbgemm() { done rm -rf fbgemm - popd else pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu @@ -283,30 +294,6 @@ function clone_pytorch_xla() { fi } -function checkout_install_torchbench() { - local commit - commit=$(get_pinned_commit torchbench) - git clone https://github.com/pytorch/benchmark torchbench - pushd torchbench - git checkout "$commit" - - if [ "$1" ]; then - python install.py --continue_on_fail models "$@" - else - # Occasionally the installation may fail on one model but it is ok to continue - # to install and test other models - python install.py --continue_on_fail - fi - - # TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488 - # is regressing speedup metric. This needs to be investigated further - pip install transformers==4.38.1 - - echo "Print all dependencies after TorchBench is installed" - python -mpip freeze - popd -} - function install_torchao() { local commit commit=$(get_pinned_commit torchao) diff --git a/.ci/pytorch/macos-build.sh b/.ci/pytorch/macos-build.sh index d7447e7d48582..d41c3c08e6288 100755 --- a/.ci/pytorch/macos-build.sh +++ b/.ci/pytorch/macos-build.sh @@ -35,11 +35,10 @@ fi print_cmake_info if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then - # Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls - USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel + USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel else - # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests - # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448 + # NB: we always build with distributed; USE_DISTRIBUTED turns off all + # backends (specifically the gloo backend), so test that this case works too USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64 fi if which sccache > /dev/null; then diff --git a/.ci/pytorch/macos-test.sh b/.ci/pytorch/macos-test.sh index 83f8e4e04331d..79d47da431712 100755 --- a/.ci/pytorch/macos-test.sh +++ b/.ci/pytorch/macos-test.sh @@ -13,9 +13,13 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available( fi popd +python -mpip install -r requirements.txt + # enable debug asserts in serialization export TORCH_SERIALIZATION_DEBUG=1 +python -mpip install --no-input -r requirements.txt + setup_test_python() { # The CircleCI worker hostname doesn't resolve to an address. # This environment variable makes ProcessGroupGloo default to @@ -157,6 +161,34 @@ test_jit_hooks() { assert_git_not_dirty } +# Shellcheck doesn't like it when you pass no arguments to a function +# that can take args. 
See https://www.shellcheck.net/wiki/SC2120 +# shellcheck disable=SC2120 +checkout_install_torchbench() { + local commit + commit=$(cat .ci/docker/ci_commit_pins/torchbench.txt) + git clone https://github.com/pytorch/benchmark torchbench + pushd torchbench + git checkout "$commit" + + if [ "$1" ]; then + python install.py --continue_on_fail models "$@" + else + # Occasionally the installation may fail on one model but it is ok to continue + # to install and test other models + python install.py --continue_on_fail + fi + popd + + pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt + # https://github.com/pytorch/pytorch/issues/160689 to remove torchao because + # its current version 0.12.0 doesn't work with transformers 4.54.0 + pip uninstall -y torchao + + echo "Print all dependencies after TorchBench is installed" + python -mpip freeze +} + torchbench_setup_macos() { git clone --recursive https://github.com/pytorch/vision torchvision git clone --recursive https://github.com/pytorch/audio torchaudio @@ -167,7 +199,7 @@ torchbench_setup_macos() { git checkout "$(cat ../.github/ci_commit_pins/vision.txt)" git submodule update --init --recursive python setup.py clean - python setup.py develop + python -m pip install -e . -v --no-build-isolation popd pushd torchaudio @@ -176,11 +208,9 @@ torchbench_setup_macos() { git submodule update --init --recursive python setup.py clean #TODO: Remove me, when figure out how to make TorchAudio find brew installed openmp - USE_OPENMP=0 python setup.py develop + USE_OPENMP=0 python -m pip install -e . -v --no-build-isolation popd - # Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120 - # shellcheck disable=SC2119,SC2120 checkout_install_torchbench } @@ -276,6 +306,47 @@ test_torchbench_smoketest() { fi done + echo "Pytorch benchmark on mps device completed" +} + +test_aoti_torchbench_smoketest() { + print_cmake_info + + echo "Launching AOTInductor torchbench setup" + pip_benchmark_deps + # shellcheck disable=SC2119,SC2120 + torchbench_setup_macos + + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + + local device=mps + local dtypes=(undefined float16 bfloat16 notset) + local dtype=${dtypes[$1]} + local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16) + + echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}" + local dtype_arg="--${dtype}" + if [ "$dtype" == notset ]; then + dtype_arg="--float32" + fi + touch "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv" + for model in "${models[@]}"; do + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ + --performance --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv" || true + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ + --accuracy --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_accuracy.csv" || true + done + + echo "Launching HuggingFace inference performance run for AOT Inductor and dtype ${dtype}" + PYTHONPATH="$(pwd)"/torchbench python 
benchmarks/dynamo/huggingface.py \ + --performance --export-aot-inductor --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_performance.csv" || true + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \ + --accuracy --export-aot-inductor --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_accuracy.csv" || true echo "Pytorch benchmark on mps device completed" } @@ -324,6 +395,8 @@ elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then test_timm_perf elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then test_torchbench_smoketest "${SHARD_NUMBER}" +elif [[ $TEST_CONFIG == *"aot_inductor_perf_smoketest"* ]]; then + test_aoti_torchbench_smoketest "${SHARD_NUMBER}" elif [[ $TEST_CONFIG == *"mps"* ]]; then test_python_mps elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then diff --git a/.ci/pytorch/multigpu-test.sh b/.ci/pytorch/multigpu-test.sh index 1a0f44b8f98a3..219463f318dbd 100755 --- a/.ci/pytorch/multigpu-test.sh +++ b/.ci/pytorch/multigpu-test.sh @@ -45,6 +45,7 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then # DTensor tests time python test/run_test.py --verbose -i distributed/tensor/test_random_ops time python test/run_test.py --verbose -i distributed/tensor/test_dtensor_compile + time python test/run_test.py --verbose -i distributed/tensor/test_utils.py # DeviceMesh test time python test/run_test.py --verbose -i distributed/test_device_mesh diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 9f2a67b4ff45b..e0d47259676b7 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -91,6 +91,7 @@ if [[ "$BUILD_ENVIRONMENT" == *clang9* || "$BUILD_ENVIRONMENT" == *xpu* ]]; then export VALGRIND=OFF fi +detect_cuda_arch if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then # There are additional warnings on s390x, maybe due to newer gcc. @@ -495,6 +496,14 @@ test_inductor_cpp_wrapper_shard() { -k 'take' \ --shard "$1" "$NUM_TEST_SHARDS" \ --verbose + + if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then + python test/run_test.py \ + --include inductor/test_mkldnn_pattern_matcher \ + -k 'xpu' \ + --shard "$1" "$NUM_TEST_SHARDS" \ + --verbose + fi } # "Global" flags for inductor benchmarking controlled by TEST_CONFIG @@ -1051,20 +1060,10 @@ test_libtorch_api() { mkdir -p $TEST_REPORTS_DIR OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml - "$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml else # Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest" - # On s390x, pytorch is built without llvm. - # Even if it would be built with llvm, llvm currently doesn't support used features on s390x and - # test fails with errors like: - # JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer - # unknown file: Failure - # C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. 
Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) } - if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then - python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr - fi fi # quantization is not fully supported on s390x yet @@ -1639,6 +1638,10 @@ elif [[ "${TEST_CONFIG}" == *xla* ]]; then install_torchvision build_xla test_xla +elif [[ "$TEST_CONFIG" == *vllm* ]]; then + echo "vLLM CI uses TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST" + (cd .ci/lumen_cli && python -m pip install -e .) + python -m cli.run test external vllm --test-plan "$TEST_CONFIG" --shard-id "$SHARD_NUMBER" --num-shards "$NUM_TEST_SHARDS" elif [[ "${TEST_CONFIG}" == *executorch* ]]; then test_executorch elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then @@ -1684,43 +1687,34 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then elif [[ "${TEST_CONFIG}" == cachebench ]]; then install_torchaudio install_torchvision - checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco - PYTHONPATH=$(pwd)/torchbench test_cachebench + PYTHONPATH=/torchbench test_cachebench elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then install_torchaudio install_torchvision - checkout_install_torchbench nanogpt - PYTHONPATH=$(pwd)/torchbench test_verify_cachebench + PYTHONPATH=/torchbench test_verify_cachebench elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then install_torchaudio install_torchvision - install_torchao id=$((SHARD_NUMBER-1)) # https://github.com/opencv/opencv-python/issues/885 pip_install opencv-python==4.8.0.74 if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then - checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer - PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf + PYTHONPATH=/torchbench test_inductor_torchbench_smoketest_perf elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then - checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \ - llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \ - functorch_maml_omniglot yolov3 mobilenet_v2 resnext50_32x4d densenet121 mnasnet1_0 - PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf + PYTHONPATH=/torchbench test_inductor_torchbench_cpu_smoketest_perf elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then - checkout_install_torchbench - TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest + TORCHBENCHPATH=/torchbench test_torchbench_gcp_smoketest else - checkout_install_torchbench # Do this after checkout_install_torchbench to ensure we clobber any # nightlies that torchbench may pull in if [[ "${TEST_CONFIG}" != *cpu* ]]; then install_torchrec_and_fbgemm fi - PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id" + PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id" fi elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then install_torchvision - PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER" + PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER" if [[ "$SHARD_NUMBER" -eq "1" ]]; then test_inductor_aoti fi diff --git a/.ci/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat index 7ceb425ce2d1a..19d715b9d0b6d 100644 --- a/.ci/pytorch/win-test-helpers/build_pytorch.bat +++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat @@ -61,9 +61,10 @@ if "%USE_XPU%"=="1" ( call "C:\Program Files (x86)\Intel\oneAPI\compiler\latest\env\vars.bat" call "C:\Program Files 
(x86)\Intel\oneAPI\ocloc\latest\env\vars.bat" if errorlevel 1 exit /b 1 - :: Reduce build time. Only have MTL self-hosted runner now - SET TORCH_XPU_ARCH_LIST=xe-lpg - SET USE_KINETO=0 + :: Reduce build time + SET TORCH_XPU_ARCH_LIST=bmg + :: Re-setup python env for build + call pip install -r requirements.txt ) @echo on diff --git a/.ci/pytorch/win-test.sh b/.ci/pytorch/win-test.sh index be7f3e4bb35cc..43524dc04e3fb 100755 --- a/.ci/pytorch/win-test.sh +++ b/.ci/pytorch/win-test.sh @@ -44,7 +44,7 @@ python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard== python -m pip install z3-solver==4.15.1.0 # Install tlparse for test\dynamo\test_structured_trace.py UTs. -python -m pip install tlparse==0.3.30 +python -m pip install tlparse==0.4.0 # Install parameterized python -m pip install parameterized==0.8.1 diff --git a/.ci/pytorch/windows/cuda126.bat b/.ci/pytorch/windows/cuda126.bat index dd30cc25d4a66..efb8cfec63e7e 100644 --- a/.ci/pytorch/windows/cuda126.bat +++ b/.ci/pytorch/windows/cuda126.bat @@ -37,7 +37,7 @@ IF "%CUDA_PATH_V126%"=="" ( ) IF "%BUILD_VISION%" == "" ( - set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0 + set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0 set TORCH_NVCC_FLAGS=-Xfatbin -compress-all ) ELSE ( set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 diff --git a/.ci/pytorch/windows/cuda130.bat b/.ci/pytorch/windows/cuda130.bat new file mode 100644 index 0000000000000..f38cd789f2da6 --- /dev/null +++ b/.ci/pytorch/windows/cuda130.bat @@ -0,0 +1,59 @@ +@echo off + +set MODULE_NAME=pytorch + +IF NOT EXIST "setup.py" IF NOT EXIST "%MODULE_NAME%" ( + call internal\clone.bat + cd %~dp0 +) ELSE ( + call internal\clean.bat +) +IF ERRORLEVEL 1 goto :eof + +call internal\check_deps.bat +IF ERRORLEVEL 1 goto :eof + +REM Check for optional components + +set USE_CUDA= +set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 + +IF "%NVTOOLSEXT_PATH%"=="" ( + IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" ( + set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt + ) ELSE ( + echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing + exit /b 1 + ) +) + +IF "%CUDA_PATH_V130%"=="" ( + IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin\nvcc.exe" ( + set "CUDA_PATH_V130=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0" + ) ELSE ( + echo CUDA 13.0 not found, failing + exit /b 1 + ) +) + +IF "%BUILD_VISION%" == "" ( + set TORCH_CUDA_ARCH_LIST=7.5;8.0;8.6;9.0;10.0;12.0 + set TORCH_NVCC_FLAGS=-Xfatbin -compress-all +) ELSE ( + set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 +) + +set "CUDA_PATH=%CUDA_PATH_V130%" +set "PATH=%CUDA_PATH_V130%\bin;%PATH%" + +:optcheck + +call internal\check_opts.bat +IF ERRORLEVEL 1 goto :eof + +if exist "%NIGHTLIES_PYTORCH_ROOT%" cd %NIGHTLIES_PYTORCH_ROOT%\.. 
+call %~dp0\internal\copy.bat +IF ERRORLEVEL 1 goto :eof + +call %~dp0\internal\setup.bat +IF ERRORLEVEL 1 goto :eof diff --git a/.ci/pytorch/windows/internal/copy.bat b/.ci/pytorch/windows/internal/copy.bat index 40f2bd7acdbb9..e0281c0d78a44 100644 --- a/.ci/pytorch/windows/internal/copy.bat +++ b/.ci/pytorch/windows/internal/copy.bat @@ -1,12 +1,20 @@ -copy "%CUDA_PATH%\bin\cusparse*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\cublas*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\cudart*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\curand*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\cufft*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\cusolver*64_*.dll*" pytorch\torch\lib + +if %CUDA_VERSION% geq 130 ( + set "dll_path=bin\x64" +) else ( + set "dll_path=bin" +) + +copy "%CUDA_PATH%\%dll_path%\cusparse*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\cublas*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\cudart*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\curand*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\cufft*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\cusolver*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\nvrtc*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\nvJitLink_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\bin\cudnn*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\nvrtc*64_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\extras\CUPTI\lib64\nvperf_host*.dll*" pytorch\torch\lib @@ -20,8 +28,3 @@ copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib if exist "C:\Windows\System32\zlibwapi.dll" ( copy "C:\Windows\System32\zlibwapi.dll" pytorch\torch\lib ) - -::copy nvJitLink dll is requires for cuda 12+ -if exist "%CUDA_PATH%\bin\nvJitLink_*.dll*" ( - copy "%CUDA_PATH%\bin\nvJitLink_*.dll*" pytorch\torch\lib -) diff --git a/.ci/pytorch/windows/internal/cuda_install.bat b/.ci/pytorch/windows/internal/cuda_install.bat index a0eb650f8506a..1349d3e661f55 100644 --- a/.ci/pytorch/windows/internal/cuda_install.bat +++ b/.ci/pytorch/windows/internal/cuda_install.bat @@ -26,6 +26,7 @@ if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR% if %CUDA_VER% EQU 126 goto cuda126 if %CUDA_VER% EQU 128 goto cuda128 if %CUDA_VER% EQU 129 goto cuda129 +if %CUDA_VER% EQU 130 goto cuda130 echo CUDA %CUDA_VERSION_STR% is not supported exit /b 1 @@ -113,6 +114,33 @@ xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" goto cuda_common +:cuda130 + +set CUDA_INSTALL_EXE=cuda_13.0.0_windows.exe +if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( + curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore + if errorlevel 1 exit /b 1 + set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" + set "ARGS=" +) + +set CUDNN_FOLDER=cudnn-windows-x86_64-9.12.0.46_cuda13-archive +set CUDNN_LIB_FOLDER="lib" +set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" +if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( + curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore + if errorlevel 1 exit /b 1 + set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" +) + +@REM cuDNN 8.3+ required zlib to be installed on the path +echo Installing ZLIB dlls +curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" 
--output "%SRC_DIR%\temp_build\zlib123dllx64.zip" +7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" +xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" + +goto cuda_common + :cuda_common :: NOTE: We only install CUDA if we don't have it installed already. :: With GHA runners these should be pre-installed as part of our AMI process diff --git a/.ci/pytorch/windows/internal/driver_update.bat b/.ci/pytorch/windows/internal/driver_update.bat index 5ed3a236c09a0..2c173aed818b4 100644 --- a/.ci/pytorch/windows/internal/driver_update.bat +++ b/.ci/pytorch/windows/internal/driver_update.bat @@ -1,9 +1,9 @@ -set WIN_DRIVER_VN=528.89 -set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe" & REM @lint-ignore -curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe +set WIN_DRIVER_VN=580.88 +set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe" & REM @lint-ignore +curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe if errorlevel 1 exit /b 1 -start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe -s -noreboot +start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe -s -noreboot if errorlevel 1 exit /b 1 -del %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe || ver > NUL +del %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe || ver > NUL diff --git a/.ci/pytorch/windows/internal/install_python.bat b/.ci/pytorch/windows/internal/install_python.bat index 73622bd736edd..84d0f9caccefb 100644 --- a/.ci/pytorch/windows/internal/install_python.bat +++ b/.ci/pytorch/windows/internal/install_python.bat @@ -1,12 +1,22 @@ set ADDITIONAL_OPTIONS="" set PYTHON_EXEC="python" + + if "%DESIRED_PYTHON%" == "3.13t" ( echo Python version is set to 3.13t set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe" set ADDITIONAL_OPTIONS="Include_freethreaded=1" set PYTHON_EXEC="python3.13t" +) else if "%DESIRED_PYTHON%"=="3.14" ( + echo Python version is set to 3.14 or 3.14t + set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe" +) else if "%DESIRED_PYTHON%"=="3.14t" ( + echo Python version is set to 3.14 or 3.14t + set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe" + set ADDITIONAL_OPTIONS="Include_freethreaded=1" + set PYTHON_EXEC="python3.14t" ) else ( - echo DESIRED_PYTHON not defined, Python version is set to %DESIRED_PYTHON% + echo Python version is set to %DESIRED_PYTHON% set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/%DESIRED_PYTHON%.0/python-%DESIRED_PYTHON%.0-amd64.exe" %= @lint-ignore =% ) diff --git a/.ci/pytorch/windows/internal/xpu_install.bat b/.ci/pytorch/windows/internal/xpu_install.bat index 2296adf4dfe66..f143571a56922 100644 --- a/.ci/pytorch/windows/internal/xpu_install.bat +++ b/.ci/pytorch/windows/internal/xpu_install.bat @@ -13,9 +13,9 @@ if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build" :xpu_bundle_install_start set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI -set 
XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe +set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product -set XPU_BUNDLE_VERSION=2025.0.1+20 +set XPU_BUNDLE_VERSION=2025.1.3+5 set XPU_BUNDLE_INSTALLED=0 set XPU_BUNDLE_UNINSTALL=0 set XPU_EXTRA_URL=NULL @@ -24,9 +24,9 @@ set XPU_EXTRA_VERSION=2025.0.1+1226 set XPU_EXTRA_INSTALLED=0 set XPU_EXTRA_UNINSTALL=0 -if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.1] ( - set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe - set XPU_BUNDLE_VERSION=2025.1.3+5 +if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.2] ( + set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe + set XPU_BUNDLE_VERSION=2025.2.1+20 ) :: Check if XPU bundle is target version or already installed @@ -90,14 +90,3 @@ if errorlevel 1 exit /b 1 del xpu_extra.exe :xpu_install_end - -if not "%XPU_ENABLE_KINETO%"=="1" goto install_end -:: Install Level Zero SDK -set XPU_EXTRA_LZ_URL=https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip -curl -k -L %XPU_EXTRA_LZ_URL% --output "%SRC_DIR%\temp_build\level_zero_sdk.zip" -echo "Installing level zero SDK..." -7z x "%SRC_DIR%\temp_build\level_zero_sdk.zip" -o"%SRC_DIR%\temp_build\level_zero" -set "INCLUDE=%SRC_DIR%\temp_build\level_zero\include;%INCLUDE%" -del "%SRC_DIR%\temp_build\level_zero_sdk.zip" - -:install_end diff --git a/.ci/pytorch/windows/setup_build.bat b/.ci/pytorch/windows/setup_build.bat index 9b492eef664d7..dbdc9891324cc 100644 --- a/.ci/pytorch/windows/setup_build.bat +++ b/.ci/pytorch/windows/setup_build.bat @@ -7,6 +7,8 @@ call "internal\install_python.bat" %PYTHON_EXEC% --version set "PATH=%CD%\Python\Lib\site-packages\cmake\data\bin;%CD%\Python\Scripts;%CD%\Python;%PATH%" +if "%DESIRED_PYTHON%" == "3.14t" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake +if "%DESIRED_PYTHON%" == "3.14" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install numpy==2.2.1 cmake if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install numpy==2.1.2 cmake if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake diff --git a/.ci/wheel/build_wheel.sh b/.ci/wheel/build_wheel.sh index 878d6595c84c0..763fce4b73e18 100755 --- a/.ci/wheel/build_wheel.sh +++ b/.ci/wheel/build_wheel.sh @@ -124,20 +124,31 @@ popd export TH_BINARY_BUILD=1 export INSTALL_TEST=0 # dont install test binaries into site-packages -export MACOSX_DEPLOYMENT_TARGET=10.15 +export MACOSX_DEPLOYMENT_TARGET=11.0 export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} -SETUPTOOLS_PINNED_VERSION="==70.1.0" -PYYAML_PINNED_VERSION="=5.3" EXTRA_CONDA_INSTALL_FLAGS="" CONDA_ENV_CREATE_FLAGS="" RENAME_WHEEL=true case $desired_python in + 3.14t) + echo "Using 3.14 deps" + NUMPY_PINNED_VERSION="==2.1.0" + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + RENAME_WHEEL=false + ;; + 3.14) + echo "Using 3.14t deps" + 
NUMPY_PINNED_VERSION="==2.1.0" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + RENAME_WHEEL=false + ;; 3.13t) echo "Using 3.13 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=6.0.1" - NUMPY_PINNED_VERSION="=2.1.0" + NUMPY_PINNED_VERSION="==2.1.0" CONDA_ENV_CREATE_FLAGS="python-freethreading" EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" desired_python="3.13" @@ -145,37 +156,23 @@ case $desired_python in ;; 3.13) echo "Using 3.13 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=6.0.1" - NUMPY_PINNED_VERSION="=2.1.0" + NUMPY_PINNED_VERSION="==2.1.0" ;; 3.12) echo "Using 3.12 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=6.0.1" - NUMPY_PINNED_VERSION="=2.0.2" + NUMPY_PINNED_VERSION="==2.0.2" ;; 3.11) echo "Using 3.11 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=5.3" - NUMPY_PINNED_VERSION="=2.0.2" + NUMPY_PINNED_VERSION="==2.0.2" ;; 3.10) echo "Using 3.10 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=5.3" - NUMPY_PINNED_VERSION="=2.0.2" - ;; - 3.9) - echo "Using 3.9 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=5.3" - NUMPY_PINNED_VERSION="=2.0.2" + NUMPY_PINNED_VERSION="==2.0.2" ;; *) - echo "Using default deps" - NUMPY_PINNED_VERSION="=1.11.3" + echo "Unsupported version $desired_python" + exit 1 ;; esac @@ -184,17 +181,18 @@ tmp_env_name="wheel_py$python_nodot" conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} source activate "$tmp_env_name" -retry pip install -r "${pytorch_rootdir}/requirements-build.txt" -pip install "numpy=${NUMPY_PINNED_VERSION}" "pyyaml${PYYAML_PINNED_VERSION}" requests ninja "setuptools${SETUPTOOLS_PINNED_VERSION}" typing-extensions +PINNED_PACKAGES=( + "numpy${NUMPY_PINNED_VERSION}" +) +retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt" +pip install requests ninja typing-extensions retry pip install -r "${pytorch_rootdir}/requirements.txt" || true retry brew install libomp -# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule +# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which +# is build as part of tensorpipe submodule export USE_DISTRIBUTED=1 -if [[ -n "$CROSS_COMPILE_ARM64" ]]; then - export CMAKE_OSX_ARCHITECTURES=arm64 -fi export USE_MKLDNN=OFF export USE_QNNPACK=OFF export BUILD_TEST=OFF @@ -202,16 +200,7 @@ export BUILD_TEST=OFF pushd "$pytorch_rootdir" echo "Calling setup.py bdist_wheel at $(date)" -if [[ "$USE_SPLIT_BUILD" == "true" ]]; then - echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)" - BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel -d "$whl_tmp_dir" - echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)" - echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" - BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 CMAKE_FRESH=1 python setup.py bdist_wheel -d "$whl_tmp_dir" - echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" -else - python setup.py bdist_wheel -d "$whl_tmp_dir" -fi +python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name ${mac_version} echo "Finished setup.py bdist_wheel at $(date)" diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh index 11678cabb2c31..c24a50b8b17ed 100755 --- a/.circleci/scripts/binary_linux_test.sh +++ 
b/.circleci/scripts/binary_linux_test.sh @@ -65,16 +65,8 @@ fi if [[ "$PACKAGE_TYPE" != libtorch ]]; then if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then - if [[ "$USE_SPLIT_BUILD" == "true" ]]; then - pkg_no_python="$(ls -1 /final_pkgs/torch_no_python* | sort |tail -1)" - pkg_torch="$(ls -1 /final_pkgs/torch-* | sort |tail -1)" - # todo: after folder is populated use the pypi_pkg channel instead - pip install "\$pkg_no_python" "\$pkg_torch" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}_pypi_pkg" - retry pip install -q numpy protobuf typing-extensions - else - pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}" - retry pip install -q numpy protobuf typing-extensions - fi + pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}" + retry pip install -q numpy protobuf typing-extensions else pip install "\$pkg" retry pip install -q numpy protobuf typing-extensions diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index 0257c5843e80e..27f0a37f3fb48 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -77,8 +77,8 @@ TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) # Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'" -# CUDA 12.9 builds have triton for Linux and Linux aarch64 binaries. -if [[ "$DESIRED_CUDA" == "cu129" ]]; then +# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries. +if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then TRITON_CONSTRAINT="platform_system == 'Linux'" fi @@ -137,7 +137,6 @@ export DESIRED_PYTHON="${DESIRED_PYTHON:-}" export DESIRED_CUDA="$DESIRED_CUDA" export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}" export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}" -export USE_SPLIT_BUILD="${USE_SPLIT_BUILD:-}" if [[ "${OSTYPE}" == "msys" ]]; then export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}" if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then diff --git a/.circleci/scripts/binary_upload.sh b/.circleci/scripts/binary_upload.sh index cf87748d538ce..d48077e112455 100755 --- a/.circleci/scripts/binary_upload.sh +++ b/.circleci/scripts/binary_upload.sh @@ -23,10 +23,6 @@ if [[ "${DRY_RUN}" = "disabled" ]]; then AWS_S3_CP="aws s3 cp" fi -if [[ "${USE_SPLIT_BUILD:-false}" == "true" ]]; then - UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_pkg" -fi - # this is special build with all dependencies packaged if [[ ${BUILD_NAME} == *-full* ]]; then UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_full" @@ -55,16 +51,12 @@ s3_upload() { s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/" fi ( - cache_control_flag="" - if [[ "${UPLOAD_CHANNEL}" = "test" ]]; then - cache_control_flag="--cache-control='no-cache,no-store,must-revalidate'" - fi for pkg in ${PKG_DIR}/*.${extension}; do ( set -x shm_id=$(sha256sum "${pkg}" | awk '{print $1}') ${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_upload_dir}" \ - --metadata "checksum-sha256=${shm_id}" ${cache_control_flag} + --metadata "checksum-sha256=${shm_id}" ) done ) diff --git a/.circleci/scripts/binary_windows_build.sh b/.circleci/scripts/binary_windows_build.sh index 27cd36f949280..18dcde50e2b65 100644 --- a/.circleci/scripts/binary_windows_build.sh +++ b/.circleci/scripts/binary_windows_build.sh @@ -15,8 +15,7 @@ fi if [[ "$DESIRED_CUDA" == 'xpu' ]]; 
then export VC_YEAR=2022 export USE_SCCACHE=0 - export XPU_VERSION=2025.1 - export XPU_ENABLE_KINETO=1 + export XPU_VERSION=2025.2 fi echo "Free space on filesystem before build:" diff --git a/.circleci/scripts/binary_windows_test.sh b/.circleci/scripts/binary_windows_test.sh index 79f714265f2c2..9326d9037e8b3 100644 --- a/.circleci/scripts/binary_windows_test.sh +++ b/.circleci/scripts/binary_windows_test.sh @@ -8,7 +8,7 @@ export VC_YEAR=2022 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then export VC_YEAR=2022 - export XPU_VERSION=2025.1 + export XPU_VERSION=2025.2 fi pushd "$PYTORCH_ROOT/.ci/pytorch/" diff --git a/.flake8 b/.flake8 index 3e8a6c3a5115a..fc9ab167fbeef 100644 --- a/.flake8 +++ b/.flake8 @@ -48,6 +48,7 @@ per-file-ignores = torch/__init__.py: F401,TOR901 torch/_custom_op/impl.py: TOR901 torch/_export/serde/upgrade.py: TOR901 + torch/_functorch/predispatch.py: TOR901 torch/_functorch/vmap.py: TOR901 torch/_inductor/test_operators.py: TOR901 torch/_library/abstract_impl.py: TOR901 diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 647671e8c83d2..798dee312306d 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -12,7 +12,9 @@ self-hosted-runner: - linux.9xlarge.ephemeral - am2.linux.9xlarge.ephemeral - linux.12xlarge + - linux.12xlarge.memory - linux.24xlarge + - linux.24xlarge.memory - linux.24xlarge.ephemeral - linux.24xlarge.amd - linux.arm64.2xlarge @@ -54,6 +56,7 @@ self-hosted-runner: - linux.rocm.gpu.2 - linux.rocm.gpu.4 # gfx942 runners + - linux.rocm.gpu.gfx942.1 - linux.rocm.gpu.gfx942.2 - linux.rocm.gpu.gfx942.4 - rocm-docker diff --git a/.github/actions/build-external-packages/action.yml b/.github/actions/build-external-packages/action.yml new file mode 100644 index 0000000000000..c0c727d93ac66 --- /dev/null +++ b/.github/actions/build-external-packages/action.yml @@ -0,0 +1,86 @@ +# .github/workflows/build-external.yml +name: Build External packages + +description: build external packages for PyTorch + +inputs: + cuda-version: + description: CUDA version to use + type: string + required: true + default: '12.8.1' + cuda-arch-list: + description: TORCH_CUDA_ARCH_LIST (e.g., "8.0;8.9;9.0") + type: string + required: true + default: "" + docker-image: + description: Base image to use + type: string + required: true + build-targets: + description: Build targets + type: string + required: true + torch-wheel-dir: + description: Directory to built torch wheel + type: string + required: false + default: dist + output-dir: + description: Directory to store build artifact + default: external + type: string + required: false + +outputs: + build_time: + description: "Total build time in seconds" + value: ${{ steps.build-external.outputs.build_time }} + output_dir: + description: "Directory where build artifact is stored" + value: ${{ steps.build-external.outputs.output_dir }} + +runs: + using: composite + steps: + - name: Build external packages in sequence + id: build-external + env: + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + SCCACHE_REGION: us-east-1 + CUDA_VERSION: ${{ inputs.cuda-version }} + TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }} + BASE_IMAGE: ${{ inputs.docker-image }} + BUILD_TARGETS: ${{ inputs.build-targets }} + PARENT_OUTPUT_DIR: ${{ inputs.output-dir }} + TORCH_WHEELS_PATH: ${{ inputs.torch-wheel-dir }} + shell: bash + run: | + set -euo pipefail + python3 --version + docker images + START_TIME=$(date +%s) + ( + cd .ci/lumen_cli + python3 -m pip install -e . 
+ ) + MAX_JOBS="$(nproc --ignore=6)" + export MAX_JOBS + + # Split the comma-separated list and build each target + IFS=',' read -ra TARGETS <<< "$BUILD_TARGETS" + for target in "${TARGETS[@]}"; do + OUTPUT_DIR="$PARENT_OUTPUT_DIR/$target" + export OUTPUT_DIR + echo "Building external package: $target in directory $OUTPUT_DIR" + python3 -m cli.run build external "$target" + done + + END_TIME=$(date +%s) + { + echo "build_time=$((END_TIME - START_TIME))" + if [ -d "$PARENT_OUTPUT_DIR" ]; then + echo "output_dir=$PARENT_OUTPUT_DIR" + fi + } >> "$GITHUB_OUTPUT" diff --git a/.github/actions/checkout-pytorch/action.yml b/.github/actions/checkout-pytorch/action.yml index 055404c69474d..15f193ef3a5dc 100644 --- a/.github/actions/checkout-pytorch/action.yml +++ b/.github/actions/checkout-pytorch/action.yml @@ -57,6 +57,21 @@ runs: submodules: ${{ inputs.submodules }} show-progress: false + - name: Clean submodules post checkout + id: clean-submodules + if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} + shell: bash + env: + NO_SUDO: ${{ inputs.no-sudo }} + run: | + cd "${GITHUB_WORKSPACE}" + # Clean stale submodule dirs + if [ -z "${NO_SUDO}" ]; then + sudo git submodule foreach --recursive git clean -ffdx + else + git submodule foreach --recursive git clean -ffdx + fi + - name: Clean workspace (try again) if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && (steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }} diff --git a/.github/actions/setup-rocm/action.yml b/.github/actions/setup-rocm/action.yml index d3644c52fbcd8..a58db801b1cf8 100644 --- a/.github/actions/setup-rocm/action.yml +++ b/.github/actions/setup-rocm/action.yml @@ -59,11 +59,6 @@ runs: echo "$msg" exit 1 fi - if [[ $ngpu -eq 1 ]]; then - echo "Error: only 1 GPU detected, at least 2 GPUs are needed for distributed jobs" - echo "$msg" - exit 1 - fi - name: Runner diskspace health check uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main diff --git a/.github/actions/test-pytorch-binary/action.yml b/.github/actions/test-pytorch-binary/action.yml index 63acd791b85c6..d4b8be8b609a0 100644 --- a/.github/actions/test-pytorch-binary/action.yml +++ b/.github/actions/test-pytorch-binary/action.yml @@ -24,7 +24,6 @@ runs: -e PYTORCH_FINAL_PACKAGE_DIR \ -e PYTORCH_ROOT \ -e SKIP_ALL_TESTS \ - -e USE_SPLIT_BUILD \ --tty \ --detach \ -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 70e9da5216ae2..b0255e764c594 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -9b57c7bd5ad4db093c5bb31c802df9f04d933ac9 +27fc2493d383354a008106f22f3be232badee9a1 diff --git a/.github/ci_commit_pins/fbgemm_rocm.txt b/.github/ci_commit_pins/fbgemm_rocm.txt index db140a31f3fa4..19f5a2b2efa1a 100644 --- a/.github/ci_commit_pins/fbgemm_rocm.txt +++ b/.github/ci_commit_pins/fbgemm_rocm.txt @@ -1 +1 @@ -7f1de94a4c2d14f59ad4ca84538c36084ea6b2c8 +08ae0af1395c8d8471f4025deb6af9aef90b342f diff --git a/.github/ci_commit_pins/torchbench.txt b/.github/ci_commit_pins/torchbench.txt deleted file mode 100644 index efbc3ceeb2afe..0000000000000 --- a/.github/ci_commit_pins/torchbench.txt +++ /dev/null @@ -1 +0,0 @@ -e03a63be43e33596f7f0a43b0f530353785e4a59 diff --git a/.github/ci_commit_pins/vllm.txt b/.github/ci_commit_pins/vllm.txt index 21863c19dec73..c9c4265b2f37f 100644 --- a/.github/ci_commit_pins/vllm.txt +++ b/.github/ci_commit_pins/vllm.txt @@ 
-1 +1 @@ -6a39ba85fe0f2fff9494b5eccea717c93510c230 +e10fef08838612b4560e9c72e5cb1414a5edfa13 diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index ee8531ae65100..eb335eb9d64d5 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -b6a5b82b9948b610fa4c304d0d869c82b8f17db1 +6c5478ff7c3d50dd1e3047d72ec5909bea474073 diff --git a/.github/ci_configs/vllm/Dockerfile.tmp_vllm b/.github/ci_configs/vllm/Dockerfile.tmp_vllm new file mode 100644 index 0000000000000..2cee6ed2df19a --- /dev/null +++ b/.github/ci_configs/vllm/Dockerfile.tmp_vllm @@ -0,0 +1,427 @@ +# TODO(elainwy): remove this file after the torch nightly dockerfile is in sync in vllm repo +# The vLLM Dockerfile is used to construct vLLM image against torch nightly and torch main that can be directly used for testing + +ARG CUDA_VERSION=12.8.1 +ARG PYTHON_VERSION=3.12 + +# BUILD_BASE_IMAGE: used to setup python build xformers, and vllm wheels, It can be replaced with a different base image from local machine, +# by default, it uses the torch-nightly-base stage from this docker image +ARG BUILD_BASE_IMAGE=torch-nightly-base + +# FINAL_BASE_IMAGE: used to set up vllm-instaled environment and build flashinfer, +# by default, it uses devel-ubuntu22.04 official image. +ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 + +# The logic is copied from https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile +ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py" + + +#################### TORCH NIGHTLY BASE IMAGE #################### +# A base image for building vLLM with devel ubuntu 22.04, this is mainly used to build vllm in vllm builtkite ci +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base + +ARG CUDA_VERSION +ARG PYTHON_VERSION +ARG GET_PIP_URL + +# Install Python and other dependencies +RUN apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim \ + && add-apt-repository -y ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version + +# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 +# as it was causing spam when compiling the CUTLASS kernels +# Ensure gcc >= 10 to avoid CUTLASS issues (bug 92519) +RUN current_gcc_version=$(gcc -dumpversion | cut -f1 -d.) 
&& \ + if command -v apt-get >/dev/null; then \ + if [ "$current_gcc_version" -lt 10 ]; then \ + echo "GCC version is $current_gcc_version, installing gcc-10..."; \ + apt-get update \ + && apt-get install -y gcc-10 g++-10 \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 \ + && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \ + else \ + echo "GCC version is $current_gcc_version, no need to install gcc-10."; \ + fi \ + fi \ + && gcc --version && g++ --version + +# install uv for faster pip installs +RUN --mount=type=cache,target=/root/.cache/uv \ + python3 -m pip install uv==0.8.4 + +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy + +#################### TORCH NIGHTLY BASE IMAGE #################### + + +#################### BASE BUILD IMAGE #################### +# A base image for building vLLM with torch nightly or torch wheels +# prepare basic build environment +FROM ${BUILD_BASE_IMAGE} AS base +USER root + +ARG CUDA_VERSION +ARG PYTHON_VERSION + +# TODO (huydhn): Only work with PyTorch manylinux builder +ENV PATH="/opt/python/cp312-cp312/bin:${PATH}" + +# Install some system dependencies and double check python version +RUN if command -v apt-get >/dev/null; then \ + apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim; \ + else \ + dnf install -y git curl wget sudo vim; \ + fi \ + && python3 --version && python3 -m pip --version + +# Workaround for https://github.com/openai/triton/issues/2507 and +# https://github.com/pytorch/pytorch/issues/107960 -- hopefully +# this won't be needed for future versions of this docker image +# or future versions of triton. +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ + +# Install uv for faster pip installs if not existed +RUN --mount=type=cache,target=/root/.cache/uv \ + if ! 
python3 -m uv --version >/dev/null 2>&1; then \ + python3 -m pip install uv==0.8.4; \ + fi +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy + +WORKDIR /workspace + +# install build and runtime dependencies +COPY requirements/common.txt requirements/common.txt +COPY use_existing_torch.py use_existing_torch.py +COPY pyproject.toml pyproject.toml + +# install build and runtime dependencies without stable torch version +RUN python3 use_existing_torch.py + +# default mount file as placeholder, this just avoid the mount error +# change to a different vllm folder if this does not exist anymore +ARG TORCH_WHEELS_PATH="./requirements" +ARG PINNED_TORCH_VERSION + +# Install torch, torchaudio and torchvision based on the input +# if TORCH_WHEELS_PATH is default "./requirements", it will pull thethe nightly versions using pip +# otherwise, it will use the whls from TORCH_WHEELS_PATH from the host machine +RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \ + --mount=type=cache,target=/root/.cache/uv \ + if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \ + echo "[INFO] Installing torch wheels to build vllm"; \ + torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \ + vision_whl=$(find /dist -name 'torchvision*.whl' | head -n1 | xargs); \ + audio_whl=$(find /dist -name 'torchaudio*.whl' | head -n1 | xargs); \ + uv pip install --system "${torch_whl}[opt-einsum]" "${vision_whl}" "${audio_whl}" /dist/*.whl; \ + elif [ -n "$PINNED_TORCH_VERSION" ]; then \ + echo "[INFO] Installing pinned torch nightly version to build vllm: $PINNED_TORCH_VERSION"; \ + uv pip install --system "$PINNED_TORCH_VERSION" --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \ + else \ + echo "[INFO] Installing torch nightly with latest one to build vllm"; \ + uv pip install --system torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \ + fi + +# Install numba 0.61.2 for cuda environment +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system numba==0.61.2 + +# Install common dependencies from vllm common.txt +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/common.txt + +# Must put before installing xformers, so it can install the correct version of xfomrers. +ARG xformers_cuda_arch_list='7.5;8.0+PTX;9.0a' +ENV TORCH_CUDA_ARCH_LIST=${xformers_cuda_arch_list} + +ARG max_jobs=16 +ENV MAX_JOBS=${max_jobs} + +RUN echo ${TORCH_CUDA_ARCH_LIST} +RUN echo ${MAX_JOBS} +RUN pip freeze | grep -E 'ninja' + +# Build xformers with cuda and torch nightly/wheel +# following official xformers guidance: https://github.com/facebookresearch/xformers#build +# sha for https://github.com/facebookresearch/xformers/tree/v0.0.32.post2 +ARG XFORMERS_COMMIT=5d4b92a5e5a9c6c6d4878283f47d82e17995b468 +ENV CCACHE_DIR=/root/.cache/ccache + +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/uv \ + echo 'git clone xformers...' \ + && git clone https://github.com/facebookresearch/xformers.git --recursive \ + && cd xformers \ + && git checkout ${XFORMERS_COMMIT} \ + && git submodule update --init --recursive \ + && echo 'finish git clone xformers...' 
\ + && rm -rf build \ + && python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \ + && cd .. \ + && rm -rf xformers + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system xformers-dist/*.whl --verbose + +# Build can take a long time, and the torch nightly version fetched from url can be different in next docker stage. +# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same +RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt + +RUN cat torch_build_versions.txt +RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio' + +#################### BASE BUILD IMAGE #################### + + +#################### WHEEL BUILD IMAGE #################### +# Image used to build vllm wheel +FROM base AS build +ARG TARGETPLATFORM + +COPY . . + +RUN python3 use_existing_torch.py + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/build.txt + +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi + +# Max jobs used by Ninja to build extensions +ARG max_jobs=16 +ENV MAX_JOBS=${max_jobs} +ARG nvcc_threads=4 +ENV NVCC_THREADS=$nvcc_threads +ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0' +ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} + +ARG USE_SCCACHE +ARG SCCACHE_BUCKET_NAME=vllm-build-sccache +ARG SCCACHE_REGION_NAME=us-west-2 +ARG SCCACHE_S3_NO_CREDENTIALS=0 + +# if USE_SCCACHE is set, use sccache to speed up compilation +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=.git,target=.git \ + if [ "$USE_SCCACHE" = "1" ]; then \ + echo "Installing sccache..." \ + && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \ + && tar -xzf sccache.tar.gz \ + && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ + && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ + && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ + && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ + && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ + && export SCCACHE_IDLE_TIMEOUT=0 \ + && export CMAKE_BUILD_TYPE=Release \ + && export VLLM_DOCKER_BUILD_CONTEXT=1 \ + && sccache --show-stats \ + && python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38 \ + && sccache --show-stats; \ + fi + +ARG vllm_target_device="cuda" +ENV VLLM_TARGET_DEVICE=${vllm_target_device} +ENV CCACHE_DIR=/root/.cache/ccache +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=.git,target=.git \ + if [ "$USE_SCCACHE" != "1" ]; then \ + # Clean any existing CMake artifacts + rm -rf .deps && \ + mkdir -p .deps && \ + export VLLM_DOCKER_BUILD_CONTEXT=1 && \ + python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \ + fi + +RUN echo "[INFO] Listing current directory:" && \ + ls -al && \ + echo "[INFO] Showing torch_build_versions.txt content:" && \ + cat torch_build_versions.txt + +#################### WHEEL BUILD IMAGE #################### + + +################### VLLM INSTALLED IMAGE #################### +# Setup clean environment for vLLM for test and api server using ubuntu22.04 with AOT flashinfer +FROM ${FINAL_BASE_IMAGE} AS vllm-base +USER root + +ARG CUDA_VERSION +ARG PYTHON_VERSION +ARG GET_PIP_URL + +# TODO (huydhn): Only 
work with PyTorch manylinux builder +ENV PATH="/opt/python/cp312-cp312/bin:${PATH}" + +# prepare for environment starts +WORKDIR /workspace + +# Install Python and other dependencies +RUN if command -v apt-get >/dev/null; then \ + apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim \ + && add-apt-repository -y ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \ + else \ + dnf install -y git curl wget sudo vim; \ + fi \ + && python3 --version && python3 -m pip --version + +# Get the torch versions, and whls used in previous stagtes for consistency +COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt +COPY --from=base /workspace/xformers-dist /wheels/xformers +COPY --from=build /workspace/vllm-dist /wheels/vllm +RUN echo "[INFO] Listing current directory before torch install step:" && \ + ls -al && \ + echo "[INFO] Showing torch_build_versions.txt content:" && \ + cat torch_build_versions.txt + +# Workaround for https://github.com/openai/triton/issues/2507 and +# https://github.com/pytorch/pytorch/issues/107960 -- hopefully +# this won't be needed for future versions of this docker image +# or future versions of triton. +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ + +# Install uv for faster pip installs if not existed +RUN --mount=type=cache,target=/root/.cache/uv \ + if ! python3 -m uv --version > /dev/null 2>&1; then \ + python3 -m pip install uv==0.8.4; \ + fi +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy + +# Default mount file as placeholder, this just avoid the mount error +ARG TORCH_WHEELS_PATH="./requirements" +# Install torch, torchaudio and torchvision +# if TORCH_WHEELS_PATH is default "./requirements", it will pull the nightly versions using pip using torch_build_versions.txt +# otherwise, it will use the whls from TORCH_WHEELS_PATH from the host machine +RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \ + --mount=type=cache,target=/root/.cache/uv \ + if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \ + torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \ + vision_whl=$(find /dist -name 'torchvision*.whl' | head -n1 | xargs); \ + audio_whl=$(find /dist -name 'torchaudio*.whl' | head -n1 | xargs); \ + echo "[INFO] Use wheels to build : '${torch_whl}' '${audio_whl}' '${vision_whl}'"; \ + uv pip install --system "${torch_whl}[opt-einsum]" "${vision_whl}" "${audio_whl}" /dist/*.whl; \ + else \ + echo "[INFO] Installing torch versions from torch_build_versions.txt"; \ + uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. 
-f1,2 | tr -d '.'); \ + fi + +# Install the vllm wheel from previous stage +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system /wheels/vllm/*.whl --verbose + +# Install xformers wheel from previous stage +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system /wheels/xformers/*.whl --verbose + +# Build flashinfer from source. +ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0' +# install package for build flashinfer +# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738 + +RUN pip install build==1.3.0 +RUN pip freeze | grep -E 'setuptools|packaging|build' + +ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} +# Build flashinfer for torch nightly from source around 10 mins +ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" +# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt +ARG FLASHINFER_GIT_REF="v0.2.14.post1" +RUN --mount=type=cache,target=/root/.cache/uv \ + git clone --depth 1 --recursive --shallow-submodules \ + --branch ${FLASHINFER_GIT_REF} \ + ${FLASHINFER_GIT_REPO} flashinfer \ + && echo "Building FlashInfer with AOT for arches: ${torch_cuda_arch_list}" \ + && cd flashinfer \ + && python3 -m flashinfer.aot \ + && python3 -m build --no-isolation --wheel --outdir ../wheels/flashinfer \ + && cd .. \ + && rm -rf flashinfer + +# install flashinfer python +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system wheels/flashinfer/*.whl --verbose + +# Logging to confirm the torch versions +RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer' +RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio\|^xformers\|^vllm\|^flashinfer' > build_summary.txt +################### VLLM INSTALLED IMAGE #################### + + +#################### UNITTEST IMAGE ############################# +FROM vllm-base as test + +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy + +COPY tests/ tests/ +COPY examples examples +COPY benchmarks benchmarks +COPY ./vllm/collect_env.py . 
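Editorial note: the pinned-torch install a few stages above keys off torch_build_versions.txt (pip-freeze style pins recorded at build time) and the cu-suffixed nightly index. As a rough sketch of that logic outside Docker -- assuming the same pin file and index URL scheme; the helper functions are hypothetical -- the install command could be assembled like this:

# Illustrative only -- not part of the Dockerfile. Rebuilds the
# "uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url ..."
# step from the pin file and the CUDA version.
from pathlib import Path


def nightly_index_url(cuda_version: str) -> str:
    """Map e.g. '12.8.1' to the cu128 nightly wheel index used above."""
    major, minor = cuda_version.split(".")[:2]
    return f"https://download.pytorch.org/whl/nightly/cu{major}{minor}"


def pinned_install_command(versions_file: str, cuda_version: str) -> list[str]:
    """Build the uv invocation that installs the exact torch pins from the build stage."""
    pins = [
        line.strip()
        for line in Path(versions_file).read_text().splitlines()
        if line.strip()
    ]
    return ["uv", "pip", "install", "--system", *pins,
            "--index-url", nightly_index_url(cuda_version)]

Recording the build-stage pins in one file and replaying them here is what keeps the torch version in the runtime image identical to the one the wheels were built against.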
+COPY requirements/common.txt requirements/common.txt +COPY use_existing_torch.py use_existing_torch.py +COPY pyproject.toml pyproject.toml +# Install build and runtime dependencies without stable torch version +COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt + +RUN python3 use_existing_torch.py + +# install packages +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/common.txt +# enable fast downloads from hf (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system hf_transfer +ENV HF_HUB_ENABLE_HF_TRANSFER 1 + +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -e tests/vllm_test_utils + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/nightly_torch_test.txt + +# Logging to confirm the torch versions +RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer' + +# Logging to confirm all the packages are installed +RUN pip freeze + +#################### UNITTEST IMAGE ############################# + +#################### EXPORT STAGE #################### +FROM scratch as export-wheels + +# Just copy the wheels we prepared in previous stages +COPY --from=base /workspace/xformers-dist /wheels/xformers +COPY --from=build /workspace/vllm-dist /wheels/vllm +COPY --from=vllm-base /workspace/build_summary.txt /wheels/build_summary.txt +COPY --from=vllm-base /workspace/wheels/flashinfer /wheels/flashinfer-python diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000000..944d3fec35659 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,24 @@ +version: 2 +updates: + # Update to the latest transformers version with dependabot + - package-ecosystem: "pip" + directory: "/.ci/docker/ci_commit_pins" + schedule: + interval: "daily" + target-branch: "main" + allow: + - dependency-name: "transformers" + ignore: + - dependency-name: "*" + update-types: ["version-update:semver-patch"] + commit-message: + prefix: "[Dependabot] Update" + include: "scope" + labels: + - "dependencies" + - "open source" + - "python" + - "topic: not user facing" + - "module: ci" + - "module: inductor" + - "ciflow/inductor" diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index a5982b63b70fc..a0aa6921b92ba 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -22,10 +22,12 @@ ciflow_push_tags: - ciflow/rocm - ciflow/rocm-mi300 - ciflow/s390 +- ciflow/riscv64 - ciflow/slow - ciflow/trunk - ciflow/unstable - ciflow/xpu +- ciflow/vllm - ciflow/torchbench - ciflow/op-benchmark - ciflow/pull diff --git a/.github/requirements/conda-env-macOS-ARM64 b/.github/requirements/conda-env-macOS-ARM64 deleted file mode 100644 index b6e9a6ce9f3e5..0000000000000 --- a/.github/requirements/conda-env-macOS-ARM64 +++ /dev/null @@ -1,5 +0,0 @@ -# Not pinning certifi so that we can always get the latest certificates -certifi -pip=23.2.1 -pkg-config=0.29.2 -wheel=0.37.1 diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt index 224835188d87f..3a27cac46f71f 100644 --- a/.github/requirements/pip-requirements-macOS.txt +++ b/.github/requirements/pip-requirements-macOS.txt @@ -28,7 +28,7 @@ pyyaml==6.0.2 scipy==1.12.0 setuptools==72.1.0 sympy==1.13.3 -tlparse==0.3.30 +tlparse==0.4.0 tensorboard==2.13.0 typing-extensions==4.12.2 unittest-xml-reporting<=3.2.0,>=2.0.0 diff --git 
a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index 9e86d332c5316..e541e7a86f653 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -119,6 +119,7 @@ def build_triton( ["git", "checkout", f"release/{ver}.{rev}.x"], cwd=triton_basedir ) else: + check_call(["git", "fetch", "origin", commit_hash], cwd=triton_basedir) check_call(["git", "checkout", commit_hash], cwd=triton_basedir) # change built wheel name and version diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index def91d29f2bd2..4a4f8a65f684d 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -16,17 +16,17 @@ # NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this -CUDA_ARCHES = ["12.6", "12.8", "12.9"] +CUDA_ARCHES = ["12.6", "12.8", "13.0"] CUDA_STABLE = "12.8" CUDA_ARCHES_FULL_VERSION = { "12.6": "12.6.3", "12.8": "12.8.1", - "12.9": "12.9.1", + "13.0": "13.0.0", } CUDA_ARCHES_CUDNN_VERSION = { "12.6": "9", "12.8": "9", - "12.9": "9", + "13.0": "9", } # NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this @@ -38,7 +38,7 @@ CPU_S390X_ARCH = ["cpu-s390x"] -CUDA_AARCH64_ARCHES = ["12.9-aarch64"] +CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "13.0-aarch64"] PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { @@ -54,7 +54,7 @@ "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'" @@ -71,49 +71,49 @@ "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'" ), - "12.9": ( - "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " - 
"nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'" + "13.0": ( + "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'" ), "xpu": ( - "intel-cmplr-lib-rt==2025.1.1 | " - "intel-cmplr-lib-ur==2025.1.1 | " - "intel-cmplr-lic-rt==2025.1.1 | " - "intel-sycl-rt==2025.1.1 | " - "oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "onemkl-sycl-blas==2025.1.0 | " - "onemkl-sycl-dft==2025.1.0 | " - "onemkl-sycl-lapack==2025.1.0 | " - "onemkl-sycl-rng==2025.1.0 | " - "onemkl-sycl-sparse==2025.1.0 | " - "dpcpp-cpp-rt==2025.1.1 | " - "intel-opencl-rt==2025.1.1 | " - "mkl==2025.1.0 | " - "intel-openmp==2025.1.1 | " - "tbb==2022.1.0 | " - "tcmlib==1.3.0 | " - "umf==0.10.0 | " - "intel-pti==0.12.3" + "intel-cmplr-lib-rt==2025.2.1 | " + "intel-cmplr-lib-ur==2025.2.1 | " + "intel-cmplr-lic-rt==2025.2.1 | " + "intel-sycl-rt==2025.2.1 | " + "oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "impi-rt==2021.16.1; platform_system == 'Linux' and 
platform_machine == 'x86_64' | " + "onemkl-sycl-blas==2025.2.0 | " + "onemkl-sycl-dft==2025.2.0 | " + "onemkl-sycl-lapack==2025.2.0 | " + "onemkl-sycl-rng==2025.2.0 | " + "onemkl-sycl-sparse==2025.2.0 | " + "dpcpp-cpp-rt==2025.2.1 | " + "intel-opencl-rt==2025.2.1 | " + "mkl==2025.2.0 | " + "intel-openmp==2025.2.1 | " + "tbb==2022.2.0 | " + "tcmlib==1.4.0 | " + "umf==0.11.0 | " + "intel-pti==0.13.1" ), } @@ -124,9 +124,7 @@ def get_nccl_wheel_version(arch_version: str) -> str: requirements = map( str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]) ) - return next(x for x in requirements if x.startswith("nvidia-nccl-cu")).split("==")[ - 1 - ] + return next(x for x in requirements if x.startswith("nvidia-nccl")).split("==")[1] def read_nccl_pin(arch_version: str) -> str: @@ -193,7 +191,7 @@ def arch_type(arch_version: str) -> str: "cpu": "libtorch-cxx11-builder:cpu", } -FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"] +FULL_PYTHON_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"] def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str: @@ -273,7 +271,6 @@ def generate_wheels_matrix( os: str, arches: Optional[list[str]] = None, python_versions: Optional[list[str]] = None, - use_split_build: bool = False, ) -> list[dict[str, str]]: package_type = "wheel" if os == "linux" or os == "linux-aarch64" or os == "linux-s390x": @@ -312,28 +309,20 @@ def generate_wheels_matrix( else arch_version ) - # TODO: Enable python 3.13t on cpu-s390x - if gpu_arch_type == "cpu-s390x" and python_version == "3.13t": - continue - # TODO: Enable python 3.14 on non linux OSes - if os != "linux" and ( - python_version == "3.14" or python_version == "3.14t" - ): + # TODO: Enable python 3.14 for rest + if os not in [ + "linux", + "linux-aarch64", + "linux-s390x", + "macos-arm64", + "windows", + ] and (python_version == "3.14" or python_version == "3.14t"): continue - if use_split_build and ( - arch_version not in ["12.6", "12.8", "12.9", "cpu"] or os != "linux" - ): - raise RuntimeError( - "Split build is only supported on linux with cuda 12* and cpu.\n" - f"Currently attempting to build on arch version {arch_version} and os {os}.\n" - "Please modify the matrix generation to exclude this combination." - ) - # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install if ( - arch_version in ["12.9", "12.8", "12.6"] + arch_version in ["13.0", "12.8", "12.6"] and os == "linux" or arch_version in CUDA_AARCH64_ARCHES ): @@ -344,7 +333,6 @@ def generate_wheels_matrix( "gpu_arch_type": gpu_arch_type, "gpu_arch_version": gpu_arch_version, "desired_cuda": desired_cuda, - "use_split_build": "True" if use_split_build else "False", "container_image": WHEEL_CONTAINER_IMAGES[arch_version].split( ":" )[0], @@ -367,30 +355,6 @@ def generate_wheels_matrix( ), # include special case for aarch64 build, remove the -aarch64 postfix } ) - # Special build building to use on Colab. 
Python 3.11 for 12.6 CUDA - if python_version == "3.11" and arch_version == CUDA_STABLE: - ret.append( - { - "python_version": python_version, - "gpu_arch_type": gpu_arch_type, - "gpu_arch_version": gpu_arch_version, - "desired_cuda": translate_desired_cuda( - gpu_arch_type, gpu_arch_version - ), - "use_split_build": "True" if use_split_build else "False", - "container_image": WHEEL_CONTAINER_IMAGES[ - arch_version - ].split(":")[0], - "container_image_tag_prefix": WHEEL_CONTAINER_IMAGES[ - arch_version - ].split(":")[1], - "package_type": package_type, - "pytorch_extra_install_requirements": "", - "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950 - ".", "_" - ), - } - ) else: ret.append( { @@ -400,7 +364,6 @@ def generate_wheels_matrix( "desired_cuda": translate_desired_cuda( gpu_arch_type, gpu_arch_version ), - "use_split_build": "True" if use_split_build else "False", "container_image": WHEEL_CONTAINER_IMAGES[arch_version].split( ":" )[0], @@ -422,6 +385,6 @@ def generate_wheels_matrix( return ret -validate_nccl_dep_consistency("12.9") +validate_nccl_dep_consistency("13.0") validate_nccl_dep_consistency("12.8") validate_nccl_dep_consistency("12.6") diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 4df6150f97655..67906d4ad88d5 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -59,9 +59,7 @@ class BinaryBuildWorkflow: is_scheduled: str = "" branches: str = "nightly" # Mainly for macos - cross_compile_arm64: bool = False macos_runner: str = "macos-14-xlarge" - use_split_build: bool = False # Mainly used for libtorch builds build_variant: str = "" @@ -72,9 +70,6 @@ def __post_init__(self) -> None: for item in [self.os, "binary", self.package_type, self.build_variant] if item != "" ) - if self.use_split_build: - # added to distinguish concurrency groups - self.build_environment += "-split" def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: output_file_path = ( @@ -117,21 +112,6 @@ class OperatingSystem: isolated_workflow=True, ), ), - # See https://github.com/pytorch/pytorch/issues/138750 - # BinaryBuildWorkflow( - # os=OperatingSystem.LINUX, - # package_type="manywheel", - # build_configs=generate_binary_build_matrix.generate_wheels_matrix( - # OperatingSystem.LINUX, - # use_split_build=True, - # arches=["11.8", "12.1", "12.4", "cpu"], - # ), - # ciflow_config=CIFlowConfig( - # labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, - # isolated_workflow=True, - # ), - # use_split_build=True, - # ), BinaryBuildWorkflow( os=OperatingSystem.LINUX, package_type="libtorch", @@ -175,27 +155,11 @@ class OperatingSystem: package_type="manywheel", build_configs=generate_binary_build_matrix.generate_wheels_matrix( OperatingSystem.LINUX, - arches=["12.6", "12.8", "12.9"], - python_versions=["3.9"], + arches=["12.8"], + python_versions=["3.12"], ), branches="main", ), - # See https://github.com/pytorch/pytorch/issues/138750 - # BinaryBuildWorkflow( - # os=OperatingSystem.LINUX, - # package_type="manywheel", - # build_configs=generate_binary_build_matrix.generate_wheels_matrix( - # OperatingSystem.LINUX, - # arches=["11.8", "12.1", "12.4"], - # python_versions=["3.9"], - # use_split_build=True, - # ), - # ciflow_config=CIFlowConfig( - # labels={LABEL_CIFLOW_PERIODIC}, - # ), - # branches="main", - # use_split_build=True, - # ), BinaryBuildWorkflow( os=OperatingSystem.LINUX, package_type="libtorch", @@ 
-338,7 +302,6 @@ class OperatingSystem: generate_binary_build_matrix.RELEASE, libtorch_variants=["shared-with-deps"], ), - cross_compile_arm64=False, macos_runner="macos-14-xlarge", ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, @@ -351,7 +314,6 @@ class OperatingSystem: build_configs=generate_binary_build_matrix.generate_wheels_matrix( OperatingSystem.MACOS_ARM64 ), - cross_compile_arm64=False, macos_runner="macos-14-xlarge", ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, diff --git a/.github/scripts/gql_mocks.json.gz b/.github/scripts/gql_mocks.json.gz index 07628227a18a8..67355239dc422 100644 Binary files a/.github/scripts/gql_mocks.json.gz and b/.github/scripts/gql_mocks.json.gz differ diff --git a/.github/scripts/runner_determinator.py b/.github/scripts/runner_determinator.py index 1481459d40c4c..baf560234549b 100644 --- a/.github/scripts/runner_determinator.py +++ b/.github/scripts/runner_determinator.py @@ -262,7 +262,12 @@ def is_exception_branch(branch: str) -> bool: """ Branches that get opted out of experiments by default, until they're explicitly enabled. """ - return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"} + return branch.split("/", maxsplit=1)[0] in { + "main", + "nightly", + "release", + "landchecks", + } def load_yaml(yaml_text: str) -> Any: diff --git a/.github/scripts/test_trymerge.py b/.github/scripts/test_trymerge.py index e4a8cb2bc8df1..ac3a1cc12921c 100755 --- a/.github/scripts/test_trymerge.py +++ b/.github/scripts/test_trymerge.py @@ -27,6 +27,7 @@ get_drci_classifications, gh_get_team_members, GitHubPR, + iter_issue_timeline_until_comment, JobCheckState, main as trymerge_main, MandatoryChecksMissingError, @@ -34,6 +35,8 @@ RE_GHSTACK_DESC, read_merge_rules, remove_job_name_suffix, + sha_from_committed_event, + sha_from_force_push_after, validate_revert, ) @@ -70,6 +73,9 @@ def save_mocked_queries(obj: Any) -> None: if key in mocked_queries: return mocked_queries[key] + # TODO: Remove me once https://github.com/pytorch/pytorch/issues/160489 is resolved + raise ValueError(f"Key {key} could not be found in gql_mocks") + try: rc = fallback_function(*args) except HTTPError as err: @@ -121,7 +127,7 @@ def __init__(self) -> None: self.force = force self.pr_num = 76123 self.dry_run = True - self.comment_id = 0 + self.comment_id = 12345 # Set to non-zero value self.reason = "this is for testing" self.ignore_current = False self.check_mergeability = False @@ -149,9 +155,9 @@ def mock_revert( def mock_merge( pr: GitHubPR, repo: GitRepo, + comment_id: int, dry_run: bool = False, skip_mandatory_checks: bool = False, - comment_id: Optional[int] = None, timeout_minutes: int = 400, stale_pr_days: int = 3, ignore_current: bool = False, @@ -467,9 +473,9 @@ def test_main_force( mock_merge.assert_called_once_with( mock.ANY, mock.ANY, + comment_id=mock.ANY, dry_run=mock.ANY, skip_mandatory_checks=True, - comment_id=mock.ANY, ignore_current=False, ) @@ -482,9 +488,9 @@ def test_main_merge(self, mock_merge: Any, *args: Any) -> None: mock_merge.assert_called_once_with( mock.ANY, mock.ANY, + comment_id=mock.ANY, dry_run=mock.ANY, skip_mandatory_checks=False, - comment_id=mock.ANY, ignore_current=False, ) @@ -1135,5 +1141,176 @@ def test__revlist_to_prs_two_prs( ) +@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql) +@mock.patch("trymerge.gh_fetch_merge_base", return_value="") +@mock.patch( + "trymerge.get_drci_classifications", 
side_effect=mocked_drci_classifications
+)
+class TestTimelineFunctions(TestCase):
+    """Tests for the new timeline-related functions"""
+
+    def test_sha_from_committed_event(self, *args: Any) -> None:
+        """Test extracting SHA from committed event"""
+        # Based on actual GitHub API format - committed events have "sha" at top level
+        event = {
+            "event": "committed",
+            "sha": "fb21ce932ded6670c918804a0d9151b773770a7c",
+        }
+        self.assertEqual(
+            sha_from_committed_event(event), "fb21ce932ded6670c918804a0d9151b773770a7c"
+        )
+
+        # Test with missing SHA
+        event_no_sha = {"event": "committed"}
+        self.assertIsNone(sha_from_committed_event(event_no_sha))
+
+    def test_sha_from_force_push_after(self, *args: Any) -> None:
+        """Test extracting SHA from force push event"""
+        # NOTE: sha_from_force_push_after handles both the current GitHub API format,
+        # where force push events carry "commit_id" at the top level, and the legacy
+        # "after", "after_commit", "after_sha", and "head_sha" fields
+
+        # Test with the legacy format
+        event_legacy = {
+            "event": "head_ref_force_pushed",
+            "after": {"sha": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e"},
+        }
+        self.assertEqual(
+            sha_from_force_push_after(event_legacy),
+            "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
+        )
+
+        # Test with the current GitHub API format (top-level "commit_id")
+        event_real_api = {
+            "event": "head_ref_force_pushed",
+            "commit_id": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
+        }
+        self.assertEqual(
+            sha_from_force_push_after(event_real_api),
+            "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
+        )  # commit_id takes precedence over the legacy fields
+
+        # Test with missing SHA
+        event_no_sha = {"event": "head_ref_force_pushed"}
+        self.assertIsNone(sha_from_force_push_after(event_no_sha))
+
+    @mock.patch("trymerge.gh_fetch_json_list")
+    def test_iter_issue_timeline_until_comment(
+        self, mock_gh_fetch_json_list: Any, *args: Any
+    ) -> None:
+        """Test timeline iteration until target comment"""
+        # Mock timeline data based on actual GitHub API format
+        timeline_data = [
+            {"event": "commented", "id": 100, "body": "first comment"},
+            {"event": "committed", "sha": "fb21ce932ded6670c918804a0d9151b773770a7c"},
+            {"event": "commented", "id": 200, "body": "target comment"},
+            {"event": "commented", "id": 300, "body": "after target"},
+        ]
+        mock_gh_fetch_json_list.return_value = timeline_data
+
+        # Test iteration stops at target comment
+        events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 200))
+        self.assertEqual(len(events), 3)  # Should stop at target comment
+        self.assertEqual(events[0]["event"], "commented")
+        self.assertEqual(events[0]["id"], 100)
+        self.assertEqual(events[1]["event"], "committed")
+        self.assertEqual(events[1]["sha"], "fb21ce932ded6670c918804a0d9151b773770a7c")
+        self.assertEqual(events[2]["event"], "commented")
+        self.assertEqual(events[2]["id"], 200)
+
+    @mock.patch("trymerge.gh_fetch_json_list")
+    def test_iter_issue_timeline_until_comment_not_found(
+        self, mock_gh_fetch_json_list: Any, *args: Any
+    ) -> None:
+        """Test timeline iteration when target comment is not found"""
+        # Mock empty timeline
+        mock_gh_fetch_json_list.return_value = []
+
+        events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 999))
+        self.assertEqual(len(events), 0)
+
+    @mock.patch("trymerge.iter_issue_timeline_until_comment")
+    def test_get_commit_sha_at_comment_commit_after_comment(
+        self, mock_iter_timeline: Any, *args: Any
+    ) -> None:
+        """Test
get_commit_sha_at_comment returns correct SHA after comment""" + mock_iter_timeline.return_value = [ + {"event": "committed", "sha": "commit1"}, + {"event": "committed", "sha": "commit2"}, + {"event": "commented", "id": 100}, + {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertEqual(sha, "commit2") + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_force_push_before_comment( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.return_value = [ + {"event": "committed", "sha": "commit1"}, + {"event": "committed", "sha": "commit2"}, + {"event": "head_ref_force_pushed", "commit_id": "commit3"}, + {"event": "commented", "id": 100}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertEqual(sha, "commit3") + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_force_push_before_comment_legacy_mode( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.return_value = [ + {"event": "committed", "sha": "commit1"}, + {"event": "committed", "sha": "commit2"}, + {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}}, + {"event": "commented", "id": 100}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertEqual(sha, "commit3") + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_multiple_comments( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.return_value = [ + {"event": "committed", "sha": "commit1"}, + {"event": "commented", "id": 100}, + {"event": "committed", "sha": "commit2"}, + {"event": "commented", "id": 200}, + {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}}, + {"event": "commented", "id": 300}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(200) + self.assertEqual(sha, "commit2") + sha = pr.get_commit_sha_at_comment(300) + self.assertEqual(sha, "commit3") + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_no_events( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.return_value = [ + {"event": "commented", "id": 100}, + {"event": "labeled", "label": {"name": "test"}}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertIsNone(sha) + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_exception( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.side_effect = Exception("API error") + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertIsNone(sha) + + if __name__ == "__main__": main() diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index 9db85ee00ebea..00b66869dcf2a 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -108,10 +108,6 @@ def __init__(self, name: str, url: str, run_id: int, status: Optional[str]): fragment PRCheckSuites on CheckSuiteConnection { edges { node { - app { - name - databaseId - } workflowRun { workflow { name @@ -454,6 +450,63 @@ def __init__(self, name: str, url: str, run_id: int, status: Optional[str]): IGNORABLE_FAILED_CHECKS_THESHOLD = 10 +def iter_issue_timeline_until_comment( + 
org: str, repo: str, issue_number: int, target_comment_id: int, max_pages: int = 200
+) -> Any:
+    """
+    Yield timeline entries in order until (and including) the entry whose id == target_comment_id
+    for a 'commented' event. Stops once the target comment is encountered.
+    """
+    page = 1
+
+    while page <= max_pages:
+        url = (
+            f"https://api.github.com/repos/{org}/{repo}/issues/{issue_number}/timeline"
+        )
+        params = {"per_page": 100, "page": page}
+
+        batch = gh_fetch_json_list(url, params)
+
+        if not batch:
+            return
+        for ev in batch:
+            # The target is the issue comment row with event == "commented" and id == target_comment_id
+            if ev.get("event") == "commented" and ev.get("id") == target_comment_id:
+                yield ev  # nothing in the timeline after this matters, so stop early
+                return
+            yield ev
+        if len(batch) < 100:
+            return
+        page += 1
+
+    # If we got here without finding the comment, then we either hit a bug or some github PR
+    # has a _really_ long timeline.
+    # The max # of pages found on any pytorch/pytorch PR at the time of this change was 41
+    raise RuntimeError(
+        f"Could not find comment {target_comment_id} in the first {max_pages} pages of the timeline at url {url}. "
+        f"This is most likely a bug, please report it to the @pytorch/pytorch-dev-infra team."
+    )
+
+
+def sha_from_committed_event(ev: dict[str, Any]) -> Optional[str]:
+    """Extract SHA from committed event in timeline"""
+    return ev.get("sha")
+
+
+def sha_from_force_push_after(ev: dict[str, Any]) -> Optional[str]:
+    """Extract SHA from force push event in timeline"""
+    # The current GitHub API format
+    commit_id = ev.get("commit_id")
+    if commit_id:
+        return str(commit_id)
+
+    # Legacy format
+    after = ev.get("after") or ev.get("after_commit") or {}
+    if isinstance(after, dict):
+        return after.get("sha") or after.get("oid")
+    return ev.get("after_sha") or ev.get("head_sha")
+
+
 def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any:
     rc = gh_graphql(GH_GET_PR_INFO_QUERY, name=proj, owner=org, number=pr_no)
     return rc["data"]["repository"]["pullRequest"]
@@ -741,16 +794,24 @@ def get_changed_files_count(self) -> int:
     def last_commit(self) -> Any:
         return self.info["commits"]["nodes"][-1]["commit"]
 
+    def last_commit_sha(self, default: Optional[str] = None) -> str:
+        # for commits, the oid is the sha
+
+        if default is None:
+            return str(self.last_commit()["oid"])
+
+        return str(self.last_commit().get("oid", default))
+
     def get_merge_base(self) -> str:
         if self.merge_base:
             return self.merge_base
 
-        last_commit_oid = self.last_commit()["oid"]
+        last_commit_sha = self.last_commit_sha()
         # NB: We could use self.base_ref() here for regular PR, however, that doesn't
         # work for ghstack where the base is the custom branch, i.e. gh/USER/ID/base,
         # so let's just use main instead
         self.merge_base = gh_fetch_merge_base(
-            self.org, self.project, last_commit_oid, self.default_branch()
+            self.org, self.project, last_commit_sha, self.default_branch()
         )
 
         # Fallback to baseRefOid if the API call fails, i.e. rate limit. Note that baseRefOid
@@ -839,6 +900,44 @@ def get_approved_by(self) -> list[str]:
     def get_commit_count(self) -> int:
         return int(self.info["commits_with_authors"]["totalCount"])
 
+    def get_commit_sha_at_comment(self, comment_id: int) -> Optional[str]:
+        """
+        Get the PR head commit SHA that was present when a specific comment was posted.
+        This ensures we only merge the state of the PR at the time the merge command was issued,
+        not any subsequent commits that may have been pushed after.
+
+        Returns None if no head-changing events are found before the comment or if the comment is not found.
+        """
+        head = None
+
+        try:
+            for event in iter_issue_timeline_until_comment(
+                self.org, self.project, self.pr_num, comment_id
+            ):
+                etype = event.get("event")
+                if etype == "committed":
+                    sha = sha_from_committed_event(event)
+                    if sha:
+                        head = sha
+                        print(f"Timeline: Found commit event for SHA {sha}")
+                elif etype == "head_ref_force_pushed":
+                    sha = sha_from_force_push_after(event)
+                    if sha:
+                        head = sha
+                        print(f"Timeline: Found force push event for SHA {sha}")
+                elif etype == "commented":
+                    if event.get("id") == comment_id:
+                        print(f"Timeline: Found target comment {comment_id}; head SHA at that point is {head}")
+                        return head
+        except Exception as e:
+            print(
+                f"Warning: Failed to reconstruct timeline for comment {comment_id}: {e}"
+            )
+            return None
+
+        print(f"Did not find comment with id {comment_id} in the PR timeline")
+        return None
+
     def get_pr_creator_login(self) -> str:
         return cast(str, self.info["author"]["login"])
 
@@ -1155,7 +1254,7 @@ def merge_into(
         *,
         skip_mandatory_checks: bool = False,
         dry_run: bool = False,
-        comment_id: Optional[int] = None,
+        comment_id: int,
         ignore_current_checks: Optional[list[str]] = None,
     ) -> None:
         # Raises exception if matching rule is not found
@@ -1171,7 +1270,7 @@ def merge_into(
             skip_internal_checks=can_skip_internal_checks(self, comment_id),
             ignore_current_checks=ignore_current_checks,
         )
-        additional_merged_prs = self.merge_changes(
+        additional_merged_prs = self.merge_changes_locally(
             repo, skip_mandatory_checks, comment_id
         )
 
@@ -1200,7 +1299,7 @@ def merge_into(
             broken_trunk_checks=ignorable_checks.get("BROKEN_TRUNK", []),
             flaky_checks=ignorable_checks.get("FLAKY", []),
             unstable_checks=ignorable_checks.get("UNSTABLE", []),
-            last_commit_sha=self.last_commit().get("oid", ""),
+            last_commit_sha=self.last_commit_sha(default=""),
             merge_base_sha=self.get_merge_base(),
             merge_commit_sha=merge_commit_sha,
             is_failed=False,
@@ -1221,7 +1320,7 @@ def merge_into(
             dry_run=dry_run,
         )
 
-    def merge_changes(
+    def merge_changes_locally(
         self,
         repo: GitRepo,
         skip_mandatory_checks: bool = False,
@@ -1230,27 +1329,15 @@ def merge_changes(
         skip_all_rule_checks: bool = False,
     ) -> list["GitHubPR"]:
         """
-        :param skip_all_rule_checks: If true, skips all rule checks, useful for dry-running merge locally
+        :param skip_all_rule_checks: If true, skips all rule checks on ghstack PRs, useful for dry-running merge locally
         """
         branch_to_merge_into = self.default_branch() if branch is None else branch
         if repo.current_branch() != branch_to_merge_into:
            repo.checkout(branch_to_merge_into)
-        if not self.is_ghstack_pr():
-            msg = self.gen_commit_message()
-            pr_branch_name = f"__pull-request-{self.pr_num}__init__"
-            repo.fetch(self.last_commit()["oid"], pr_branch_name)
-            repo._run_git("merge", "--squash", pr_branch_name)
-            repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
-
-            # Did the PR change since we started the merge?
-            pulled_sha = repo.show_ref(pr_branch_name)
-            latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
-            if pulled_sha != latest_pr_status.last_commit()["oid"]:
-                raise RuntimeError(
-                    "PR has been updated since CI checks last passed. Please rerun the merge command."
-                )
-            return []
-        else:
+
+        # It's okay to skip the commit SHA check for ghstack PRs since
+        # authoring requires write access to the repo.
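+        # For regular (non-ghstack) PRs, the code below merges the exact commit that
+        # was HEAD when the merge comment was posted, as reconstructed from the issue
+        # timeline, and bails out if the PR has been updated since then.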
+ if self.is_ghstack_pr(): return self.merge_ghstack_into( repo, skip_mandatory_checks, @@ -1258,6 +1345,48 @@ def merge_changes( skip_all_rule_checks=skip_all_rule_checks, ) + msg = self.gen_commit_message() + pr_branch_name = f"__pull-request-{self.pr_num}__init__" + + # Determine which commit SHA to merge + commit_to_merge = None + if not comment_id: + raise ValueError("Must provide --comment-id when merging regular PRs") + + # Get the commit SHA that was present when the comment was made + commit_to_merge = self.get_commit_sha_at_comment(comment_id) + if not commit_to_merge: + raise RuntimeError( + f"Could not find commit that was pushed before comment {comment_id}" + ) + + # Validate that this commit is the latest commit on the PR + latest_commit = self.last_commit_sha() + if commit_to_merge != latest_commit: + raise RuntimeError( + f"Commit {commit_to_merge} was HEAD when comment {comment_id} was posted " + f"but now the latest commit on the PR is {latest_commit}. " + f"Please re-issue the merge command to merge the latest commit." + ) + + print(f"Merging commit {commit_to_merge} locally") + + repo.fetch(commit_to_merge, pr_branch_name) + repo._run_git("merge", "--squash", pr_branch_name) + repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg) + + # Did the PR change since we started the merge? + pulled_sha = repo.show_ref(pr_branch_name) + latest_pr_status = GitHubPR(self.org, self.project, self.pr_num) + if ( + pulled_sha != latest_pr_status.last_commit_sha() + or pulled_sha != commit_to_merge + ): + raise RuntimeError( + "PR has been updated since CI checks last passed. Please rerun the merge command." + ) + return [] + class MergeRuleFailedError(RuntimeError): def __init__(self, message: str, rule: Optional["MergeRule"] = None) -> None: @@ -1462,7 +1591,7 @@ def find_matching_merge_rule( pending_checks = [] failed_checks = [] - hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit()['oid']}" + hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit_sha()}" if len(failed_checks) > 0: if reject_reason_score < 30000: reject_reason_score = 30000 @@ -2160,14 +2289,14 @@ def categorize_checks( def merge( pr: GitHubPR, repo: GitRepo, + comment_id: int, dry_run: bool = False, skip_mandatory_checks: bool = False, - comment_id: Optional[int] = None, timeout_minutes: int = 400, stale_pr_days: int = 3, ignore_current: bool = False, ) -> None: - initial_commit_sha = pr.last_commit()["oid"] + initial_commit_sha = pr.last_commit_sha() pr_link = f"https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num}" print(f"Attempting merge of {initial_commit_sha} ({pr_link})") @@ -2238,7 +2367,7 @@ def merge( f"Attempting merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} ({elapsed_time / 60} minutes elapsed)" ) pr = GitHubPR(pr.org, pr.project, pr.pr_num) - if initial_commit_sha != pr.last_commit()["oid"]: + if initial_commit_sha != pr.last_commit_sha(): raise RuntimeError( "New commits were pushed while merging. Please rerun the merge command." 
) @@ -2405,7 +2534,7 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: if args.check_mergeability: if pr.is_ghstack_pr(): get_ghstack_prs(repo, pr) # raises error if out of sync - pr.merge_changes( + pr.merge_changes_locally( repo, skip_mandatory_checks=True, skip_all_rule_checks=True, @@ -2420,12 +2549,18 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: gh_post_pr_comment(org, project, args.pr_num, message, dry_run=args.dry_run) return try: + # Ensure comment id is set, else fail + if not args.comment_id: + raise ValueError( + "Comment ID is required for merging PRs, please provide it using --comment-id" + ) + merge( pr, repo, + comment_id=args.comment_id, dry_run=args.dry_run, skip_mandatory_checks=args.force, - comment_id=args.comment_id, ignore_current=args.ignore_current, ) except Exception as e: @@ -2447,7 +2582,7 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: broken_trunk_checks=[], flaky_checks=[], unstable_checks=[], - last_commit_sha=pr.last_commit().get("oid", ""), + last_commit_sha=pr.last_commit_sha(default=""), merge_base_sha=pr.get_merge_base(), is_failed=True, skip_mandatory_checks=args.force, diff --git a/.github/scripts/windows/build_magma.bat b/.github/scripts/windows/build_magma.bat index 0f11fe34068eb..75c916ecdbef7 100644 --- a/.github/scripts/windows/build_magma.bat +++ b/.github/scripts/windows/build_magma.bat @@ -35,6 +35,9 @@ cd magma mkdir build && cd build set GPU_TARGET=All +if "%CUVER_NODOT%" == "130" ( + set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 +) if "%CUVER_NODOT%" == "129" ( set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 ) diff --git a/.github/scripts/windows/build_triton.bat b/.github/scripts/windows/build_triton.bat index 97cd535a49889..d26dc8bf3b198 100644 --- a/.github/scripts/windows/build_triton.bat +++ b/.github/scripts/windows/build_triton.bat @@ -1,18 +1,12 @@ @echo on -set PYTHON_PREFIX=%PY_VERS:.=% -set PYTHON_PREFIX=py%PYTHON_PREFIX:;=;py% -call .ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat -:: Create a new conda environment -if "%PY_VERS%" == "3.13t" ( - call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python-freethreading python=3.13 -) else ( - call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python=%PY_VERS% -) +set DESIRED_PYTHON=%PY_VERS% +call .ci/pytorch/windows/internal/install_python.bat + :: Fix cmake version for issue https://github.com/pytorch/pytorch/issues/150480 -call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja +%PYTHON_EXEC% -m pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja==1.11.1.4 dir "%VC_INSTALL_PATH%" call "%VC_INSTALL_PATH%\VC\Auxiliary\Build\vcvarsall.bat" x64 -call conda run -n %PYTHON_PREFIX% python .github/scripts/build_triton_wheel.py --device=%BUILD_DEVICE% %RELEASE% +%PYTHON_EXEC% .github/scripts/build_triton_wheel.py --device=%BUILD_DEVICE% %RELEASE% diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 23d4c003efa86..064eea7592230 100644 --- a/.github/templates/common.yml.j2 +++ 
b/.github/templates/common.yml.j2 @@ -4,7 +4,7 @@ {%- set download_artifact_action = "actions/download-artifact@v4.1.7" -%} {%- set timeout_minutes = 240 -%} -{%- set timeout_minutes_windows_binary = 300 -%} +{%- set timeout_minutes_windows_binary = 360 -%} {%- macro concurrency(build_environment) -%} concurrency: diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index b14a13f3f90c2..fee9ca2eac120 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -114,12 +114,12 @@ jobs: ALPINE_IMAGE: "docker.io/s390x/alpine" {%- elif config["gpu_arch_type"] == "rocm" %} runs_on: linux.rocm.gpu - {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] in ["12.8", "12.9"] %} + {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] in ["12.6"] %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner {%- elif config["gpu_arch_type"] == "cuda" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner {%- else %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.4xlarge @@ -135,7 +135,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@main - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index 29b92ad461ef4..f4b2a66d2acda 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -47,9 +47,6 @@ env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} SKIP_ALL_TESTS: 0 -{%- if cross_compile_arm64 %} - CROSS_COMPILE_ARM64: 1 -{% endif %} !{{ common.concurrency(build_environment) }} jobs: @@ -71,11 +68,6 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi !{{ common.checkout(deep_clone=False, directory="pytorch") }} - name: Populate binary env run: | @@ -113,12 +105,33 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" - if [[ $DESIRED_PYTHON == "3.13t" ]]; then - conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge - SMOKE_TEST_PARAMS="--torch-compile-check disabled" - else - conda create -yn "test_conda_env" python="$DESIRED_PYTHON" - fi + + EXTRA_CONDA_INSTALL_FLAGS="" + CONDA_ENV_CREATE_FLAGS="" + # shellcheck disable=SC2153 + case $DESIRED_PYTHON in + 3.14t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.14) + 
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.13t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" + ;; + *) + # shellcheck disable=SC2153 + desired_python=${DESIRED_PYTHON} + ;; + esac + + # shellcheck disable=SC2086 + conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} conda activate test_conda_env pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v diff --git a/.github/templates/upload.yml.j2 b/.github/templates/upload.yml.j2 index f159d623f1bf7..5e3798f8e2377 100644 --- a/.github/templates/upload.yml.j2 +++ b/.github/templates/upload.yml.j2 @@ -15,7 +15,7 @@ # favor of GPU_ARCH_VERSION DESIRED_CUDA: !{{ config["desired_cuda"] }} {%- if config["gpu_arch_version"] %} - GPU_ARCH_VERSION: !{{ config["gpu_arch_version"] }} + GPU_ARCH_VERSION: "!{{ config["gpu_arch_version"] }}" {%- endif %} GPU_ARCH_TYPE: !{{ config["gpu_arch_type"] }} {%- if include_skip_tests %} @@ -25,11 +25,6 @@ DOCKER_IMAGE: !{{ config["container_image"] }} DOCKER_IMAGE_TAG_PREFIX: !{{ config["container_image_tag_prefix"] }} {%- endif %} -{%- if config["package_type"] == "manywheel" %} - {%- if config.use_split_build is defined %} - use_split_build: !{{ config["use_split_build"] }} - {%- endif %} -{%- endif %} {%- if config["package_type"] == "libtorch" %} {%- if config["libtorch_config"] %} LIBTORCH_CONFIG: !{{ config["libtorch_config"] }} @@ -38,7 +33,7 @@ {%- if is_windows %} # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" {%- endif %} {%- else %} diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml index f11ee4a6621e1..bfa035bc753b8 100644 --- a/.github/workflows/_binary-build-linux.yml +++ b/.github/workflows/_binary-build-linux.yml @@ -26,13 +26,6 @@ on: default: 240 type: number description: timeout for the job - use_split_build: - description: | - [Experimental] Build a libtorch only wheel and build pytorch such that - are built from the libtorch wheel. - required: false - type: boolean - default: false ALPINE_IMAGE: required: false type: string @@ -117,7 +110,6 @@ jobs: PR_NUMBER: ${{ github.event.pull_request.number }} PYTORCH_FINAL_PACKAGE_DIR: /artifacts SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - USE_SPLIT_BUILD: ${{ inputs.use_split_build }} steps: - name: Make the env permanent during this workflow (but not the secrets) shell: bash @@ -142,7 +134,6 @@ jobs: echo "PR_NUMBER=${{ env.PR_NUMBER }}" echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" echo "SHA1=${{ env.SHA1 }}" - echo "USE_SPLIT_BUILD=${{ env.use_split_build }}" } >> "${GITHUB_ENV} }}" - name: List the env @@ -261,7 +252,6 @@ jobs: -e PYTORCH_ROOT \ -e SKIP_ALL_TESTS \ -e PYTORCH_EXTRA_INSTALL_REQUIREMENTS \ - -e USE_SPLIT_BUILD \ --tty \ --detach \ -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml index 434167d0f0c6d..2d9e4d0e27b25 100644 --- a/.github/workflows/_binary-test-linux.yml +++ b/.github/workflows/_binary-test-linux.yml @@ -64,13 +64,6 @@ on: required: true type: string description: Hardware to run this job on. 
Valid values are linux.4xlarge, linux.4xlarge.nvidia.gpu, linux.arm64.2xlarge, and linux.rocm.gpu
-      use_split_build:
-        description: |
-          [Experimental] Build a libtorch only wheel and build pytorch such that
-          are built from the libtorch wheel.
-        required: false
-        type: boolean
-        default: false
     secrets:
       github-token:
         required: true
@@ -104,7 +97,6 @@ jobs:
       PR_NUMBER: ${{ github.event.pull_request.number }}
       PYTORCH_FINAL_PACKAGE_DIR: /artifacts
       SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-      USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
     steps:
       - name: Make the env permanent during this workflow (but not the secrets)
        shell: bash
@@ -129,7 +121,6 @@ jobs:
            echo "PR_NUMBER=${{ env.PR_NUMBER }}"
            echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
            echo "SHA1=${{ env.SHA1 }}"
-            echo "USE_SPLIT_BUILD=${{ env.USE_SPLIT_BUILD }}"
          } >> "${GITHUB_ENV} }}"

      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
@@ -196,6 +187,8 @@ jobs:
      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
+        with:
+          driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }}
        if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }}

      - name: configure aws credentials
diff --git a/.github/workflows/_binary-upload.yml b/.github/workflows/_binary-upload.yml
index 6750102b5a293..636b76d42931a 100644
--- a/.github/workflows/_binary-upload.yml
+++ b/.github/workflows/_binary-upload.yml
@@ -51,13 +51,6 @@ on:
        required: false
        type: string
        description: Desired python version
-      use_split_build:
-        description: |
-          [Experimental] Build a libtorch only wheel and build pytorch such that
-          are built from the libtorch wheel.
-        required: false
-        type: boolean
-        default: false
     secrets:
       github-token:
         required: true
@@ -86,7 +79,6 @@ jobs:
      PR_NUMBER: ${{ github.event.pull_request.number }}
      PYTORCH_FINAL_PACKAGE_DIR: /artifacts
      SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-      USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
    steps:
    - name: Checkout PyTorch
      uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
diff --git a/.github/workflows/_link_check.yml b/.github/workflows/_link_check.yml
index efe92ca627bba..014e6106b0730 100644
--- a/.github/workflows/_link_check.yml
+++ b/.github/workflows/_link_check.yml
@@ -13,6 +13,7 @@ jobs:
    if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-url-lint') }}
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
+      job-name: lint-urls
      timeout: 120
      runner: ${{ inputs.runner }}linux.2xlarge
      docker-image: ci-image:pytorch-linux-jammy-linter
@@ -38,6 +39,7 @@ jobs:
    if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-xref-lint') }}
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
+      job-name: lint-xrefs
      timeout: 60
      runner: ${{ inputs.runner }}linux.2xlarge
      docker-image: ci-image:pytorch-linux-jammy-linter
diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml
index 5173425009f69..6b4bd429e3c9f 100644
--- a/.github/workflows/_linux-build.yml
+++ b/.github/workflows/_linux-build.yml
@@ -96,6 +96,13 @@ on:
        required: false
        type: string
        default: ""
+      build-external-packages:
+        description: |
+          If set, build the listed external packages and save their wheels as build artifacts.
+          Use a comma-separated list of package names, e.g. 'vllm,transformers'.
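+          Leave empty (the default) to skip building external packages.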
+ required: false + type: string + default: "" secrets: HUGGING_FACE_HUB_TOKEN: @@ -121,7 +128,7 @@ jobs: # Don't run on forked repos if: github.repository_owner == 'pytorch' runs-on: ${{ inputs.runner_prefix}}${{ inputs.runner }} - timeout-minutes: 240 + timeout-minutes: 480 outputs: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} test-matrix: ${{ steps.filter.outputs.test-matrix }} @@ -262,6 +269,7 @@ jobs: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} BUILD_ADDITIONAL_PACKAGES: ${{ inputs.build-additional-packages }} + RUNNER: ${{ inputs.runner }} run: | START_TIME=$(date +%s) if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then @@ -287,10 +295,36 @@ jobs: # comes from https://github.com/pytorch/test-infra/pull/6058 TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3)) + if [[ ${BUILD_ENVIRONMENT} == *"riscv64"* ]]; then + # EC2 specific setup for RISC-V emulation + # Ensure binfmt_misc is available + echo "Mounting binfmt_misc filesystem" + sudo mount binfmt_misc -t binfmt_misc /proc/sys/fs/binfmt_misc 2>/dev/null || true + + echo "QEMU registration: multiarch/qemu-user-static" + docker run --rm --privileged multiarch/qemu-user-static --reset -p yes || true + + # Final verification + echo "Checking binfmt_misc status:" + ls -la /proc/sys/fs/binfmt_misc/ 2>/dev/null || echo "Cannot access binfmt_misc directory" + + if [ -f /proc/sys/fs/binfmt_misc/qemu-riscv64 ]; then + echo "qemu-riscv64 registration successful" + else + echo "qemu-riscv64 registration failed - proceeding without emulation" + echo "This may cause RISC-V builds to fail" + fi + + RISCV_DOCKER_ARGS="--privileged" + else + RISCV_DOCKER_ARGS= + fi + # detached container should get cleaned up by teardown_ec2_linux # Used for JENKINS_USER and DOCKER_SHELL_CMD, which can be empty # shellcheck disable=SC2086 container_name=$(docker run \ + ${RISCV_DOCKER_ARGS} \ -e BUILD_ENVIRONMENT \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e PR_NUMBER \ @@ -306,8 +340,8 @@ jobs: -e OUR_GITHUB_JOB_ID \ -e HUGGING_FACE_HUB_TOKEN \ -e SCRIBE_GRAPHQL_ACCESS_TOKEN \ - -e USE_SPLIT_BUILD \ -e BUILD_ADDITIONAL_PACKAGES \ + -e RUNNER \ --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \ --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ @@ -331,6 +365,26 @@ jobs: END_TIME=$(date +%s) echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT" + - name: Build external packages + id: build-external-packages + if: inputs.build-external-packages != '' && steps.build.outcome != 'skipped' + uses: ./.github/actions/build-external-packages + with: + build-targets: ${{ inputs.build-external-packages }} + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + cuda-arch-list: ${{ inputs.cuda-arch-list }} + output-dir: external + + - name: Move external packages to dist + if: steps.build-external-packages.outputs.output_dir != '' && steps.build-external-packages.outcome != 'skipped' + shell: bash + run: | + src="${{ steps.build-external-packages.outputs.output_dir }}" + if [ -d "$src" ]; then + mkdir -p "dist/$(dirname "$src")" + mv "$src" "dist/$(dirname "$src")/" + fi + - name: Stop monitoring script if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }} shell: bash diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 07be3720b2bf2..66579b573a63d 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml 
@@ -72,6 +72,10 @@ on: required: false description: | HF Auth token to avoid rate limits when downloading models or datasets from hub + VLLM_TEST_HUGGING_FACE_TOKEN: + required: false + description: | + HF Auth token to test vllm SCRIBE_GRAPHQL_ACCESS_TOKEN: required: false description: | @@ -286,6 +290,7 @@ jobs: PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} DASHBOARD_TAG: ${{ inputs.dashboard-tag }} + VLLM_TEST_HUGGING_FACE_TOKEN: ${{ secrets.VLLM_TEST_HUGGING_FACE_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} @@ -362,6 +367,7 @@ jobs: -e PYTORCH_TEST_RERUN_DISABLED_TESTS \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e HUGGING_FACE_HUB_TOKEN \ + -e VLLM_TEST_HUGGING_FACE_TOKEN \ -e SCRIBE_GRAPHQL_ACCESS_TOKEN \ -e DASHBOARD_TAG \ -e ARTIFACTS_FILE_SUFFIX \ @@ -403,7 +409,7 @@ jobs: job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} - name: Authenticate with AWS - if: ${{ contains(matrix.runner, 'b200') }} + if: ${{ always() && contains(matrix.runner, 'b200') }} uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index 063c97e449c75..086e25b4868eb 100644 --- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -136,7 +136,7 @@ jobs: MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} run: | - "$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_sajson==0.6.7 + "$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_json==0.6.7 "$VENV_PATH/bin/python3" -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index 2d660d98905e9..f73972942b5f9 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -88,6 +88,16 @@ jobs: - name: Setup ROCm uses: ./.github/actions/setup-rocm + - name: Runner check GPU count (distributed jobs) + if: ${{ contains(matrix.config, 'distributed') }} + shell: bash + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ $ngpu -lt 4 ]]; then + echo "Error: only $ngpu GPU(s) detected, at least 4 GPUs are needed for distributed jobs" + exit 1 + fi + - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index ebfb4001e4379..7067d79eb0758 100644 --- a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -77,6 +77,7 @@ jobs: run: | git config --global core.longpaths true git config --global core.symlinks true + git config --global core.ignorecase false # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock # the directory on Windows and prevent GHA from checking out as reported diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index 0c95503928fb9..5049ef61f6930 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -70,6 +70,7 @@ jobs: run: | git config --global core.longpaths true git config --global core.symlinks true + git config --global core.ignorecase false # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock # the directory on Windows and prevent GHA from checking out as reported diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml index 177e6ca4bbe3c..7aa7608924487 100644 --- a/.github/workflows/_xpu-test.yml +++ b/.github/workflows/_xpu-test.yml @@ -275,7 +275,7 @@ jobs: - name: Change permissions if: ${{ always() && steps.test.conclusion }} run: | - docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" + docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1000:1000 test" - name: Print remaining test logs shell: bash diff --git a/.github/workflows/build-almalinux-images.yml b/.github/workflows/build-almalinux-images.yml index aaf85d7fc8067..0754b154a358d 100644 --- a/.github/workflows/build-almalinux-images.yml +++ b/.github/workflows/build-almalinux-images.yml @@ -36,7 +36,7 @@ jobs: runs-on: linux.9xlarge.ephemeral strategy: matrix: - tag: ["cuda12.6", "cuda12.8", "cuda12.9", "rocm6.3", "rocm6.4", "cpu"] + tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"] steps: - name: Build docker image uses: pytorch/pytorch/.github/actions/binary-docker-build@main diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index b2d50efd7d96c..cc2f54fc45f84 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -48,6 +48,7 @@ jobs: fail-fast: false matrix: include: [ + { tag: "cuda13.0" }, { tag: "cuda12.9" }, { tag: "cuda12.8" }, { tag: "cuda12.6" }, diff --git a/.github/workflows/build-magma-linux.yml b/.github/workflows/build-magma-linux.yml index e13de48b2408a..be8f613169e8c 100644 --- a/.github/workflows/build-magma-linux.yml +++ b/.github/workflows/build-magma-linux.yml @@ -34,7 +34,7 @@ jobs: id-token: write strategy: matrix: - cuda_version: ["129", "128", "126"] + cuda_version: ["130", "129", "128", "126"] steps: - name: Checkout PyTorch uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/build-magma-windows.yml b/.github/workflows/build-magma-windows.yml index 80d870f419e42..b7d293a5cec11 100644 --- a/.github/workflows/build-magma-windows.yml +++ b/.github/workflows/build-magma-windows.yml @@ -22,7 +22,7 @@ jobs: runs-on: windows-2022 strategy: matrix: - cuda_version: ["129", "128", "126"] + cuda_version: ["130", "129", "128", "126"] config: ["Release", "Debug"] env: CUDA_VERSION: ${{ matrix.cuda_version }} diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index e84b84f6158ba..9d08501f51bc5 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -46,11 +46,12 @@ jobs: fail-fast: false matrix: include: [ - { name: "manylinux2_28-builder", tag: "cuda12.9", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cuda13.0", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.8", runner: 
"linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.6", runner: "linux.9xlarge.ephemeral" }, - { name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinuxaarch64-builder", tag: "cuda13.0", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" }, diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index d54f459d0b43e..932d9c8863027 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -145,7 +145,7 @@ jobs: fi docker exec -t "${container_name}" yum install -y zlib-devel zip - docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==2.13.1 auditwheel wheel + docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==3.0.1 auditwheel wheel set +e docker exec -t "${container_name}" command -v pip has_pip=$? @@ -194,7 +194,7 @@ jobs: strategy: fail-fast: false matrix: - py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ] + py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] device: ["xpu"] timeout-minutes: 40 env: diff --git a/.github/workflows/build-vllm-wheel.yml b/.github/workflows/build-vllm-wheel.yml new file mode 100644 index 0000000000000..658e02ede6fbd --- /dev/null +++ b/.github/workflows/build-vllm-wheel.yml @@ -0,0 +1,248 @@ +name: Build vLLM wheels + +on: + push: + branches: + - main + paths: + - .github/workflows/build-vllm-wheel.yml + - .github/ci_commit_pins/vllm.txt + workflow_dispatch: + pull_request: + paths: + - .github/workflows/build-vllm-wheel.yml + - .github/ci_commit_pins/vllm.txt + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + build-wheel: + if: github.repository_owner == 'pytorch' + strategy: + fail-fast: false + matrix: + python-version: [ '3.12' ] + # TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554 + device: [ 'cu128', 'cu129' ] + runner: [ 'linux.12xlarge.memory' ] + include: + - device: cu128 + manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8' + - device: cu129 + manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9' + name: "Build ${{ matrix.device }} vLLM wheel" + runs-on: ${{ matrix.runner }} + timeout-minutes: 480 + env: + PY_VERS: ${{ matrix.python-version }} + MANYLINUX_IMAGE: ${{ matrix.manylinux-image }} + PLATFORM: 'manylinux_2_28_x86_64' + BUILD_DEVICE: ${{ matrix.device }} + steps: + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@main + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Get latest PyTorch nightly + shell: bash + run: | + set -eux + + # Keep PyTorch nightly wheel here so that we 
can install it later during + # vLLM build process + mkdir -p "${RUNNER_TEMP}/artifacts/" + + container_name=$(docker run \ + --tty \ + --detach \ + -e PLATFORM \ + -v "${GITHUB_WORKSPACE}:/pytorch" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w /artifacts/ \ + "${MANYLINUX_IMAGE}" + ) + + # Determine python executable for given version (copied from build-triton-wheel) + case $PY_VERS in + 3.10) + PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python + ;; + 3.11) + PYTHON_EXECUTABLE=/opt/python/cp311-cp311/bin/python + ;; + 3.12) + PYTHON_EXECUTABLE=/opt/python/cp312-cp312/bin/python + ;; + 3.13) + PYTHON_EXECUTABLE=/opt/python/cp313-cp313/bin/python + ;; + 3.13t) + PYTHON_EXECUTABLE=/opt/python/cp313-cp313t/bin/python + ;; + 3.14) + PYTHON_EXECUTABLE=/opt/python/cp314-cp314/bin/python + ;; + 3.14t) + PYTHON_EXECUTABLE=/opt/python/cp314-cp314t/bin/python + ;; + *) + echo "Unsupported python version ${PY_VERS}" + exit 1 + ;; + esac + + docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip install \ + --pre torch torchvision torchaudio \ + --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}" + + # I wonder if there is a command to both download and install the wheels + # in one go + docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip download \ + --pre torch torchvision torchaudio \ + --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}" + + # Save this for later + echo "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}" >> "$GITHUB_ENV" + echo "container_name=${container_name}" >> "$GITHUB_ENV" + + - name: Build vLLM wheel + uses: ./.github/actions/build-external-packages + with: + build-targets: vllm + docker-image: ${{ env.MANYLINUX_IMAGE }} + cuda-arch-list: '8.0;8.9;9.0;10.0;12.0' + torch-wheel-dir: ${{ runner.temp }}/artifacts + output-dir: ${{ runner.temp }}/artifacts/externals + + - name: Prepare vLLM wheel + shell: bash + run: | + set -eux + + # Get these wheels ready, the vllm renaming logic is copied from its .buildkite/scripts/upload-wheels.sh + docker exec -t "${container_name}" bash -c " + set -eux + + nightly=\$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2 | cut -d'.' -f4) + + pushd externals/vllm/wheels + for package in xformers flashinfer-python vllm; do + pushd \$package + auditwheel repair --plat \$PLATFORM *.whl \ + --exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv* + repair_wheel=\$(find wheelhouse -name *\${PLATFORM}*) + repair_wheel=\$(basename \${repair_wheel}) + popd + + cp \${package}/wheelhouse/\${repair_wheel} . + version=\$(unzip -p \$repair_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) + + if [[ \$package == vllm ]]; then + new_wheel=\${repair_wheel/\$version/1.0.0.\$nightly} + else + major_version=\$(echo \$version | tr '.+' '.' | cut -d'.' 
-f1-3) + new_wheel=\${repair_wheel/\$version/\$major_version.\$nightly} + fi + + mv -- \$repair_wheel \$new_wheel + rm -rf \$package + done + popd + " + + docker exec -t "${container_name}" chown -R 1000:1000 /artifacts + + - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + with: + name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }} + if-no-files-found: error + path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() + + # Copied from build-triton-wheel workflow (mostly) + upload-wheel: + name: "Upload ${{ matrix.device }} vLLM wheel" + needs: + - build-wheel + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + device: [ 'cu128', 'cu129' ] + env: + BUILD_DEVICE: ${{ matrix.device }} + permissions: + id-token: write + contents: read + container: + image: continuumio/miniconda3:4.12.0 + environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }} + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Configure AWS credentials(PyTorch account) for main + if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }} + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels + aws-region: us-east-1 + + - name: Configure AWS credentials(PyTorch account) for RC builds + if: ${{ github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/')) }} + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_test_build_wheels + aws-region: us-east-1 + + - name: Download Build Artifacts + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + # Download all available artifacts + path: ${{ runner.temp }}/artifacts-all + + - name: Select Wheel Artifacts + shell: bash + run: | + set -eux + mkdir -p "${RUNNER_TEMP}/artifacts/" + mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/" + + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }} + shell: bash + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }} + shell: bash + run: | + set -ex + + if [[ "${GITHUB_REF_NAME}" = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + + - name: Upload binaries + env: + PACKAGE_TYPE: wheel + UPLOAD_SUBFOLDER: ${{ env.BUILD_DEVICE }} + PKG_DIR: ${{ runner.temp }}/artifacts + shell: bash + run: | + set -ex + bash .circleci/scripts/binary_upload.sh diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index db8fbcb4bdc7d..57fe7be15d298 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -57,6 +57,11 @@ jobs: echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV" - name: Checkout optional submodules run: python3 tools/optional_submodules.py + - name: Copy docs requirements 
for inclusion + run: | + # Replace symlink with actual file + rm docs/requirements.txt || true + cp .ci/docker/requirements-docs.txt docs/requirements.txt - name: Create source distribution run: | # Create new folder with specified name so extracting the archive yields that diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index c27f651b6b3aa..492f41775d9de 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -50,35 +50,31 @@ jobs: runner: [linux.12xlarge] docker-image-name: [ pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11, + pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11, pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm, - pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks, - pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks, - pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks, pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks, pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9, pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11, - pytorch-linux-jammy-py3.9-clang12, - pytorch-linux-jammy-py3.11-clang12, - pytorch-linux-jammy-py3.12-clang12, + pytorch-linux-jammy-py3.10-clang12, pytorch-linux-jammy-py3.13-clang12, pytorch-linux-jammy-rocm-n-py3, pytorch-linux-noble-rocm-n-py3, pytorch-linux-noble-rocm-alpha-py3, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12, - pytorch-linux-jammy-py3.9-gcc11, - pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks, + pytorch-linux-jammy-rocm-n-py3-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12, + pytorch-linux-jammy-py3.10-gcc11, + pytorch-linux-jammy-py3-gcc11-inductor-benchmarks, pytorch-linux-jammy-py3.12-halide, - pytorch-linux-jammy-xpu-2025.0-py3, - pytorch-linux-jammy-xpu-2025.1-py3, + pytorch-linux-jammy-xpu-n-1-py3, + pytorch-linux-jammy-xpu-n-py3, pytorch-linux-jammy-py3-clang18-asan, pytorch-linux-jammy-py3-clang12-onnx, pytorch-linux-jammy-linter, pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter, # Executorch pin needs update # pytorch-linux-jammy-py3-clang12-executorch, - pytorch-linux-jammy-py3.12-triton-cpu + pytorch-linux-jammy-py3.12-triton-cpu, + pytorch-linux-noble-riscv64-py3.12-gcc14 ] include: - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11 @@ -127,7 +123,7 @@ jobs: GHCR_PAT: ${{ secrets.GHCR_PAT }} with: shell: bash - timeout_minutes: 30 + timeout_minutes: 60 max_attempts: 5 retry_wait_seconds: 90 command: | diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index 8cde3006e3816..860ee21cda6a7 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -47,7 +47,7 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - manywheel-py3_9-cpu-aarch64-build: + manywheel-py3_10-cpu-aarch64-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -60,19 +60,18 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False - DESIRED_PYTHON: 
"3.9" + DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_9-cpu-aarch64 + build_name: manywheel-py3_10-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cpu-aarch64-test: # Testing + manywheel-py3_10-cpu-aarch64-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_9-cpu-aarch64-build + - manywheel-py3_10-cpu-aarch64-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -84,21 +83,20 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cpu-aarch64 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.2xlarge ALPINE_IMAGE: "arm64v8/alpine" secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cpu-aarch64-upload: # Uploading + manywheel-py3_10-cpu-aarch64-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_9-cpu-aarch64-test + needs: manywheel-py3_10-cpu-aarch64-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel @@ -108,14 +106,13 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cpu-aarch64 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_9-cuda-aarch64-12_9-build: + manywheel-py3_10-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -124,46 +121,44 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False - DESIRED_PYTHON: "3.9" + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_9-cuda-aarch64-12_9 + build_name: manywheel-py3_10-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_10-cuda-aarch64-12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_9-cuda-aarch64-12_9-build + needs: manywheel-py3_10-cuda-aarch64-12_6-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda-aarch64-12_9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda-aarch64-12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-cpu-aarch64-build: + manywheel-py3_10-cuda-aarch64-12_8-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -172,66 +167,44 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # 
favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu-aarch64 - DOCKER_IMAGE: manylinux2_28_aarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_10-cpu-aarch64 - build_environment: linux-aarch64-binary-manywheel - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cpu-aarch64-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_10-cpu-aarch64-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu-aarch64 - DOCKER_IMAGE: manylinux2_28_aarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cpu-aarch64 + build_name: manywheel-py3_10-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.2xlarge - ALPINE_IMAGE: "arm64v8/alpine" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cpu-aarch64-upload: # Uploading + manywheel-py3_10-cuda-aarch64-12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_10-cpu-aarch64-test + needs: manywheel-py3_10-cuda-aarch64-12_8-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu-aarch64 - DOCKER_IMAGE: manylinux2_28_aarch64-builder - 
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cpu-aarch64 + build_name: manywheel-py3_10-cuda-aarch64-12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-cuda-aarch64-12_9-build: + manywheel-py3_10-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -240,41 +213,39 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_10-cuda-aarch64-12_9 + build_name: manywheel-py3_10-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine 
== 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_10-cuda-aarch64-13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_10-cuda-aarch64-12_9-build + needs: manywheel-py3_10-cuda-aarch64-13_0-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda-aarch64-12_9 + build_name: manywheel-py3_10-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -292,7 +263,6 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral @@ -316,7 +286,6 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -340,14 +309,105 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-cuda-aarch64-12_9-build: + manywheel-py3_11-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_11-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' 
and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_11-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_11-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_11-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -356,41 +416,39 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_11-cuda-aarch64-12_9 + build_name: manywheel-py3_11-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_11-cuda-aarch64-13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_11-cuda-aarch64-12_9-build + needs: manywheel-py3_11-cuda-aarch64-13_0-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda-aarch64-12_9 + build_name: manywheel-py3_11-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -408,7 +466,6 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral @@ -432,7 +489,6 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -456,14 +512,105 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - 
manywheel-py3_12-cuda-aarch64-12_9-build: + manywheel-py3_12-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_12-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_12-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: 
"3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_12-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_12-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -472,41 +619,39 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_12-cuda-aarch64-12_9 + build_name: manywheel-py3_12-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_12-cuda-aarch64-13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_12-cuda-aarch64-12_9-build + needs: manywheel-py3_12-cuda-aarch64-13_0-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda-aarch64-12_9 + build_name: 
manywheel-py3_12-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -524,7 +669,6 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral @@ -548,7 +692,6 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -572,14 +715,13 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13-cuda-aarch64-12_9-build: + manywheel-py3_13-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -588,46 +730,44 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_13-cuda-aarch64-12_9 + build_name: manywheel-py3_13-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_13-cuda-aarch64-12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13-cuda-aarch64-12_9-build + needs: manywheel-py3_13-cuda-aarch64-12_6-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda-aarch64-12_9 + build_name: manywheel-py3_13-cuda-aarch64-12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-cpu-aarch64-build: + manywheel-py3_13-cuda-aarch64-12_8-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -636,66 +776,44 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu-aarch64 - DOCKER_IMAGE: manylinux2_28_aarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False - DESIRED_PYTHON: "3.13t" + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_13t-cpu-aarch64 - build_environment: linux-aarch64-binary-manywheel - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cpu-aarch64-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_13t-cpu-aarch64-build - - get-label-type - uses: 
./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu-aarch64 - DOCKER_IMAGE: manylinux2_28_aarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False - DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cpu-aarch64 + build_name: manywheel-py3_13-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.2xlarge - ALPINE_IMAGE: "arm64v8/alpine" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cpu-aarch64-upload: # Uploading + manywheel-py3_13-cuda-aarch64-12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13t-cpu-aarch64-test + needs: manywheel-py3_13-cuda-aarch64-12_8-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu-aarch64 - DOCKER_IMAGE: manylinux2_28_aarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False - DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cpu-aarch64 + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda-aarch64-12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-cuda-aarch64-12_9-build: + manywheel-py3_13-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -704,41 +822,648 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - 
DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False - DESIRED_PYTHON: "3.13t" + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_13t-cuda-aarch64-12_9 + build_name: manywheel-py3_13-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda-aarch64-12_9-upload: # Uploading + 
manywheel-py3_13-cuda-aarch64-13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13t-cuda-aarch64-12_9-build + needs: manywheel-py3_13-cuda-aarch64-13_0-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda-aarch64-13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-cpu-aarch64-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda-aarch64-12_9 + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13t-cpu-aarch64 + build_environment: linux-aarch64-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cpu-aarch64-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13t-cpu-aarch64-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cpu-aarch64 + build_environment: linux-aarch64-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.2xlarge + ALPINE_IMAGE: "arm64v8/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cpu-aarch64-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cpu-aarch64-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cpu-aarch64 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + 
GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13t-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13t-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-cuda-aarch64-13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13t-cuda-aarch64-13_0 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda-aarch64-13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cuda-aarch64-13_0-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda-aarch64-13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14-cpu-aarch64-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + DESIRED_PYTHON: "3.14" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14-cpu-aarch64 + build_environment: linux-aarch64-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cpu-aarch64-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14-cpu-aarch64-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cpu-aarch64 + build_environment: linux-aarch64-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.2xlarge + ALPINE_IMAGE: "arm64v8/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cpu-aarch64-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cpu-aarch64-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: 
manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cpu-aarch64 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.14" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of 
GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.14" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14-cuda-aarch64-13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.14" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14-cuda-aarch64-13_0 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda-aarch64-13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cuda-aarch64-13_0-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cuda-aarch64-13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14t-cpu-aarch64-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + DESIRED_PYTHON: "3.14t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14t-cpu-aarch64 + build_environment: linux-aarch64-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cpu-aarch64-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14t-cpu-aarch64-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cpu-aarch64 + build_environment: linux-aarch64-binary-manywheel + runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.2xlarge + ALPINE_IMAGE: "arm64v8/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cpu-aarch64-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cpu-aarch64-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cpu-aarch64 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14t-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.14t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14t-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + 
GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14t-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.14t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14t-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14t-cuda-aarch64-13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that 
we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.14t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14t-cuda-aarch64-13_0 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda-aarch64-13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cuda-aarch64-13_0-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml index 9f4a8194d2874..03835a9f5f352 100644 --- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml @@ -122,7 +122,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -145,7 +145,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder 
DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -154,7 +154,7 @@ jobs: build_name: libtorch-cuda12_6-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_6-shared-with-deps-release-upload: # Uploading @@ -169,7 +169,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -190,7 +190,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -213,7 +213,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -222,7 +222,7 @@ jobs: build_name: libtorch-cuda12_8-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_8-shared-with-deps-release-upload: # Uploading @@ -237,7 +237,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -248,7 +248,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda12_9-shared-with-deps-release-build: + libtorch-cuda13_0-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -257,22 +257,22 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: libtorch-cuda12_9-shared-with-deps-release + build_name: libtorch-cuda13_0-shared-with-deps-release build_environment: linux-binary-libtorch secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cuda12_9-shared-with-deps-release-test: # Testing + libtorch-cuda13_0-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - libtorch-cuda12_9-shared-with-deps-release-build + - libtorch-cuda13_0-shared-with-deps-release-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -280,38 +280,38 @@ jobs: 
PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps - build_name: libtorch-cuda12_9-shared-with-deps-release + build_name: libtorch-cuda13_0-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading + libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: libtorch-cuda12_9-shared-with-deps-release-test + needs: libtorch-cuda13_0-shared-with-deps-release-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps - build_name: libtorch-cuda12_9-shared-with-deps-release + build_name: libtorch-cuda13_0-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -326,7 +326,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -350,7 +350,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: libtorch-cxx11-builder @@ -419,7 +419,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -440,7 +440,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -464,7 +464,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: libtorch-cxx11-builder @@ -533,7 +533,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 diff --git 
a/.github/workflows/generated-linux-binary-manywheel-main.yml b/.github/workflows/generated-linux-binary-manywheel-main.yml index d1e89bb6e2d85..ec08b2c78eb67 100644 --- a/.github/workflows/generated-linux-binary-manywheel-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-main.yml @@ -42,54 +42,7 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - manywheel-py3_9-cuda12_6-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda12_6 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_6-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cuda12_6-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_6 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - manywheel-py3_9-cuda12_8-build: + 
manywheel-py3_12-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -99,22 +52,21 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda12_8 + build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 
'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_8-test: # Testing + manywheel-py3_12-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_9-cuda12_8-build + - manywheel-py3_12-cuda12_8-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -123,62 +75,14 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_8 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - manywheel-py3_9-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 - 
GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_9 + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 464bef0e1f7db..8a581a1f21fe1 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -47,619 +47,6 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - manywheel-py3_9-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cpu - build_environment: linux-binary-manywheel - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cpu-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cpu - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-cpu-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_9-cuda12_6-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda - 
DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda12_6 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_6-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cuda12_6-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_6 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_6-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-cuda12_6-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_6 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_9-cuda12_8-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we 
eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda12_8 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_8-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cuda12_8-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_8 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_8-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-cuda12_8-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_8 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_9-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: 
./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_9 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_9-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-cuda12_9-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_9 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - 
uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_9-rocm6_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-rocm6_3 - build_environment: linux-binary-manywheel - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-rocm6_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-rocm6_3-build - - get-label-type - runs-on: linux.rocm.gpu.mi250 - timeout-minutes: 240 - env: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 - GPU_ARCH_TYPE: rocm - SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False - DESIRED_PYTHON: "3.9" - steps: - - name: Setup ROCm - uses: ./.github/actions/setup-rocm - - uses: actions/download-artifact@v4.1.7 - name: Download Build Artifacts - with: - name: manywheel-py3_9-rocm6_3 - path: "${{ runner.temp }}/artifacts/" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: ROCm set GPU_FLAG - run: | - echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" - - name: configure aws credentials - id: aws_creds - if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only - aws-region: us-east-1 - role-duration-seconds: 18000 - - name: Calculate docker image - id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main - with: - docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} - docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.3 - docker-build-dir: .ci/docker - working-directory: pytorch - - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main - with: - docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - - name: Test Pytorch binary - uses: ./pytorch/.github/actions/test-pytorch-binary - env: - DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - - name: Teardown ROCm - uses: ./.github/actions/teardown-rocm - manywheel-py3_9-rocm6_3-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-rocm6_3-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of 
in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-rocm6_3 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_9-rocm6_4-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-rocm6_4 - build_environment: linux-binary-manywheel - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-rocm6_4-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-rocm6_4-build - - get-label-type - runs-on: linux.rocm.gpu.mi250 - timeout-minutes: 240 - env: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 - GPU_ARCH_TYPE: rocm - SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False - DESIRED_PYTHON: "3.9" - steps: - - name: Setup ROCm - uses: ./.github/actions/setup-rocm - - uses: actions/download-artifact@v4.1.7 - name: Download Build Artifacts - with: - name: manywheel-py3_9-rocm6_4 - path: "${{ runner.temp }}/artifacts/" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: ROCm set GPU_FLAG - run: | - echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" - - name: configure aws credentials - id: aws_creds - if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only - aws-region: us-east-1 - role-duration-seconds: 18000 - - name: Calculate docker image - id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main - with: - docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} - docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 - docker-build-dir: .ci/docker - working-directory: pytorch - - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main - with: - docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - - name: Test Pytorch binary - uses: ./pytorch/.github/actions/test-pytorch-binary - env: - DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - - name: Teardown ROCm - uses: 
./.github/actions/teardown-rocm - manywheel-py3_9-rocm6_4-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-rocm6_4-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-rocm6_4 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_9-xpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-xpu - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-xpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-xpu-build - - get-label-type - runs-on: linux.idc.xpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu - SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False - DESIRED_PYTHON: "3.9" - permissions: - id-token: write - contents: read - steps: - - name: Setup XPU - uses: ./.github/actions/setup-xpu - - name: configure aws credentials - id: aws_creds - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only - aws-region: us-east-1 - - name: Login to Amazon ECR - id: login-ecr - uses: aws-actions/amazon-ecr-login@v2 - - uses: actions/download-artifact@v4.1.7 - name: Download Build Artifacts - with: - name: manywheel-py3_9-xpu - path: "${{ runner.temp }}/artifacts/" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd 
- working-directory: pytorch - - name: Calculate docker image - id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main - with: - docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} - docker-image-name: manylinux2_28-builder - custom-tag-prefix: xpu - docker-build-dir: .ci/docker - working-directory: pytorch - - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main - with: - docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - - name: Test Pytorch binary - uses: ./pytorch/.github/actions/test-pytorch-binary - env: - DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - - name: Teardown XPU - uses: ./.github/actions/teardown-xpu - manywheel-py3_9-xpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-xpu-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-xpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -673,7 +60,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cpu @@ -695,7 +81,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu build_environment: linux-binary-manywheel @@ -718,7 +103,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu secrets: @@ -735,16 +119,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-test: # Testing @@ -759,16 +142,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-upload: # Uploading @@ -783,11 +165,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_6 secrets: @@ -804,16 +185,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: 
manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-test: # Testing @@ -828,16 +208,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.10" build_name: 
manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-upload: # Uploading @@ -852,18 +231,17 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-cuda12_9-build: + manywheel-py3_10-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -872,23 +250,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_10-cuda12_9 + build_name: manywheel-py3_10-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda12_9-test: # Testing + manywheel-py3_10-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_10-cuda12_9-build + - manywheel-py3_10-cuda13_0-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -896,38 +273,36 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda12_9 + build_name: manywheel-py3_10-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda12_9-upload: # Uploading + manywheel-py3_10-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_10-cuda12_9-test + needs: manywheel-py3_10-cuda13_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda12_9 + build_name: manywheel-py3_10-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -942,11 +317,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-rocm6_3 @@ -966,12 +340,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - 
GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.10" steps: - name: Setup ROCm @@ -1035,11 +408,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-rocm6_3 secrets: @@ -1056,11 +428,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-rocm6_4 @@ -1080,12 +451,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.10" steps: - name: Setup ROCm @@ -1149,11 +519,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-rocm6_4 secrets: @@ -1173,12 +542,11 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 
secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-xpu-test: # Testing @@ -1198,14 +566,13 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.10" permissions: id-token: write contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@main - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -1266,7 +633,6 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-xpu secrets: @@ -1286,7 +652,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cpu @@ -1308,7 +673,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu build_environment: linux-binary-manywheel @@ -1331,7 +695,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu secrets: @@ -1348,16 +711,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-test: # Testing @@ -1372,16 +734,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-upload: # Uploading @@ -1396,11 +757,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_6 secrets: @@ -1417,16 +777,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-test: # Testing @@ -1441,16 +800,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-upload: # Uploading @@ -1465,86 +823,17 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - 
manywheel-py3_11-cuda12_8-full-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False - DESIRED_PYTHON: "3.11" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_11-cuda12_8-full - build_environment: linux-binary-manywheel - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda12_8-full-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_11-cuda12_8-full-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_8-full - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda12_8-full-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_11-cuda12_8-full-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_8-full - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_11-cuda12_9-build: + manywheel-py3_11-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -1553,23 +842,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_11-cuda12_9 + build_name: manywheel-py3_11-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda12_9-test: # Testing + manywheel-py3_11-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_11-cuda12_9-build + - manywheel-py3_11-cuda13_0-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -1577,38 +865,36 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_9 + build_name: manywheel-py3_11-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - 
manywheel-py3_11-cuda12_9-upload: # Uploading + manywheel-py3_11-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_11-cuda12_9-test + needs: manywheel-py3_11-cuda13_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_9 + build_name: manywheel-py3_11-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -1623,11 +909,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-rocm6_3 @@ -1647,12 +932,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.11" steps: - name: Setup ROCm @@ -1716,11 +1000,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-rocm6_3 secrets: @@ -1737,11 +1020,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-rocm6_4 @@ -1761,12 +1043,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.11" steps: - name: Setup ROCm @@ -1830,11 +1111,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-rocm6_4 secrets: @@ -1854,12 +1134,11 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: 
manywheel-py3_11-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-xpu-test: # Testing @@ -1879,14 +1158,13 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.11" permissions: id-token: write contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@main - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -1947,7 +1225,6 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-xpu secrets: @@ -1967,7 +1244,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cpu @@ -1989,7 +1265,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu build_environment: linux-binary-manywheel @@ -2012,7 +1287,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu secrets: @@ -2029,16 +1303,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-test: # Testing @@ -2053,16 +1326,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-upload: # Uploading @@ -2077,11 +1349,10 @@ jobs: 
# TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_6 secrets: @@ -2098,16 +1369,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-test: # Testing @@ -2122,16 +1392,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-upload: # Uploading @@ -2146,18 +1415,17 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_12-cuda12_9-build: + manywheel-py3_12-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2166,23 +1434,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_12-cuda12_9 + build_name: manywheel-py3_12-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda12_9-test: # Testing + manywheel-py3_12-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_12-cuda12_9-build + - manywheel-py3_12-cuda13_0-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -2190,38 +1457,36 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda12_9 + build_name: manywheel-py3_12-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda12_9-upload: # Uploading + manywheel-py3_12-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_12-cuda12_9-test + needs: manywheel-py3_12-cuda13_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda12_9 + build_name: manywheel-py3_12-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: 
./.github/workflows/_binary-upload.yml @@ -2236,11 +1501,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-rocm6_3 @@ -2260,12 +1524,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.12" steps: - name: Setup ROCm @@ -2329,11 +1592,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-rocm6_3 secrets: @@ -2350,11 +1612,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-rocm6_4 @@ -2374,12 +1635,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.12" steps: - name: Setup ROCm @@ -2443,11 +1703,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-rocm6_4 secrets: @@ -2467,12 +1726,11 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | 
intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-xpu-test: # Testing @@ -2492,14 +1750,13 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.12" permissions: id-token: write contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@main - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -2560,7 +1817,6 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-xpu secrets: @@ -2580,7 +1836,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cpu @@ -2602,7 +1857,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu build_environment: linux-binary-manywheel @@ -2625,7 +1879,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu secrets: @@ -2642,16 +1895,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 
'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-test: # Testing @@ -2666,16 +1918,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-upload: # Uploading @@ -2690,11 +1941,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_6 secrets: @@ -2711,16 +1961,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel - 
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-test: # Testing @@ -2735,16 +1984,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: 
linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-upload: # Uploading @@ -2759,18 +2007,17 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13-cuda12_9-build: + manywheel-py3_13-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2779,23 +2026,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13-cuda12_9 + build_name: manywheel-py3_13-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda12_9-test: # Testing + manywheel-py3_13-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_13-cuda12_9-build + - manywheel-py3_13-cuda13_0-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -2803,38 +2049,36 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda12_9 + build_name: manywheel-py3_13-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda12_9-upload: # Uploading + manywheel-py3_13-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13-cuda12_9-test + needs: manywheel-py3_13-cuda13_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda12_9 + build_name: manywheel-py3_13-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -2849,11 +2093,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-rocm6_3 @@ -2873,12 +2116,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.13" steps: 
- name: Setup ROCm @@ -2942,11 +2184,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-rocm6_3 secrets: @@ -2963,11 +2204,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-rocm6_4 @@ -2987,12 +2227,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.13" steps: - name: Setup ROCm @@ -3056,11 +2295,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-rocm6_4 secrets: @@ -3080,12 +2318,11 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-xpu-test: # Testing @@ -3105,14 +2342,13 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - 
use_split_build: False DESIRED_PYTHON: "3.13" permissions: id-token: write contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@main - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -3173,7 +2409,6 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-xpu secrets: @@ -3193,7 +2428,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cpu @@ -3215,7 +2449,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu build_environment: linux-binary-manywheel @@ -3238,7 +2471,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu secrets: @@ -3255,16 +2487,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-test: # Testing @@ -3279,16 +2510,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-upload: # Uploading @@ -3303,11 +2533,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_6 secrets: @@ -3324,16 +2553,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-test: # Testing @@ -3348,16 +2576,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-upload: # Uploading @@ -3372,18 +2599,17 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-cuda12_9-build: + manywheel-py3_13t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml 
needs: get-label-type @@ -3392,23 +2618,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13t-cuda12_9 + build_name: manywheel-py3_13t-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_9-test: # Testing + 
manywheel-py3_13t-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_13t-cuda12_9-build + - manywheel-py3_13t-cuda13_0-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -3416,38 +2641,36 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_9 + build_name: manywheel-py3_13t-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_9-upload: # Uploading + manywheel-py3_13t-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13t-cuda12_9-test + needs: manywheel-py3_13t-cuda13_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_9 + build_name: manywheel-py3_13t-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -3462,11 +2685,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-rocm6_3 @@ -3486,12 +2708,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.13t" steps: - name: Setup ROCm @@ -3555,11 +2776,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-rocm6_3 secrets: @@ -3576,11 +2796,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-rocm6_4 @@ -3600,12 +2819,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.13t" steps: - name: Setup ROCm @@ -3669,11 +2887,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-rocm6_4 secrets: @@ -3693,12 +2910,11 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-xpu-test: # Testing @@ -3718,14 +2934,13 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.13t" permissions: id-token: write contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@main - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -3786,7 +3001,6 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-xpu secrets: @@ -3806,7 +3020,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cpu 
@@ -3828,7 +3041,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-cpu build_environment: linux-binary-manywheel @@ -3851,7 +3063,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-cpu secrets: @@ -3868,16 +3079,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 
'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_6-test: # Testing @@ -3892,16 +3102,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_6-upload: # Uploading @@ -3916,11 +3125,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-cuda12_6 secrets: @@ -3937,16 +3145,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_8-test: # Testing @@ -3961,16 +3168,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_8-upload: # Uploading @@ -3985,18 +3191,17 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14-cuda12_9-build: + manywheel-py3_14-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -4005,23 +3210,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_14-cuda12_9 + build_name: manywheel-py3_14-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14-cuda12_9-test: # Testing + manywheel-py3_14-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_14-cuda12_9-build + - manywheel-py3_14-cuda13_0-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -4029,38 +3233,36 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14" - build_name: manywheel-py3_14-cuda12_9 + build_name: manywheel-py3_14-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14-cuda12_9-upload: # Uploading + manywheel-py3_14-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_14-cuda12_9-test + needs: manywheel-py3_14-cuda13_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14" - build_name: manywheel-py3_14-cuda12_9 + build_name: manywheel-py3_14-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -4075,11 +3277,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-rocm6_3 @@ -4099,12 +3300,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.14" steps: - name: Setup ROCm @@ -4168,11 +3368,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-rocm6_3 secrets: @@ -4189,11 +3388,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-rocm6_4 @@ -4213,12 +3411,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.14" steps: - name: Setup ROCm @@ -4282,11 +3479,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-rocm6_4 
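The PYTORCH_EXTRA_INSTALL_REQUIREMENTS values above are " | "-separated PEP 508 requirements, most of them gated on platform_system == 'Linux' and platform_machine == 'x86_64' markers; for CUDA 13.0 the package names also change (nvidia-cublas, nvidia-nccl-cu13, nvidia-cusparselt-cu13, and so on, instead of the -cu12 names). A small sketch of the format using the packaging library (illustration only; how the build actually consumes this variable is out of scope here, and the three entries are copied from requirement strings elsewhere in these hunks):

# Illustration of the PYTORCH_EXTRA_INSTALL_REQUIREMENTS format only; this
# snippet is not part of the workflow. Requires the 'packaging' library.
from packaging.requirements import Requirement

extra = (
    "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "onemkl-sycl-blas==2025.2.0"
)

for spec in extra.split(" | "):
    req = Requirement(spec)
    # Entries without a marker (req.marker is None) apply on every platform.
    applies = req.marker is None or req.marker.evaluate()
    print(f"{req.name}{req.specifier}: {'applies here' if applies else 'skipped on this platform'}")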
secrets: @@ -4306,12 +3502,11 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-xpu-test: # Testing @@ -4331,14 +3526,13 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.14" permissions: id-token: write contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@main - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -4399,7 +3593,6 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-xpu secrets: @@ -4419,7 +3612,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cpu @@ -4441,7 +3633,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-cpu build_environment: linux-binary-manywheel @@ -4464,7 +3655,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-cpu secrets: @@ -4481,16 +3671,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" 
build_name: manywheel-py3_14t-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_6-test: # Testing @@ -4505,16 +3694,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # 
for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_6-upload: # Uploading @@ -4529,11 +3717,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-cuda12_6 secrets: @@ -4550,16 +3737,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system 
== 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_8-test: # Testing @@ -4574,16 +3760,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_8-upload: # Uploading @@ -4598,18 +3783,17 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14t-cuda12_9-build: + manywheel-py3_14t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -4618,23 +3802,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_14t-cuda12_9 + build_name: manywheel-py3_14t-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14t-cuda12_9-test: # Testing + manywheel-py3_14t-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_14t-cuda12_9-build + - manywheel-py3_14t-cuda13_0-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -4642,38 +3825,36 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14t" - build_name: manywheel-py3_14t-cuda12_9 + build_name: manywheel-py3_14t-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14t-cuda12_9-upload: # Uploading + manywheel-py3_14t-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_14t-cuda12_9-test + needs: manywheel-py3_14t-cuda13_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - 
DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14t" - build_name: manywheel-py3_14t-cuda12_9 + build_name: manywheel-py3_14t-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -4688,11 +3869,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-rocm6_3 @@ -4712,12 +3892,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.14t" steps: - name: Setup ROCm @@ -4781,11 +3960,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-rocm6_3 secrets: @@ -4802,11 +3980,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-rocm6_4 @@ -4826,12 +4003,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.14t" steps: - name: Setup ROCm @@ -4895,11 +4071,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-rocm6_4 secrets: @@ -4919,12 +4094,11 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | 
onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-xpu-test: # Testing @@ -4944,14 +4118,13 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.14t" permissions: id-token: write contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@main - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -5012,7 +4185,6 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-xpu secrets: diff --git a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml index b6b63c4e38d5e..8177bac3fe216 100644 --- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml @@ -54,11 +54,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_9-rocm6_4 @@ -78,12 +77,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.9" steps: - name: Setup ROCm diff --git a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml index 66c0813afe900..4a7ebe8366336 100644 --- a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml @@ -47,7 +47,7 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - manywheel-py3_9-cpu-s390x-build: + manywheel-py3_10-cpu-s390x-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -60,19 +60,18 @@ jobs: 
GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" timeout-minutes: 420 - build_name: manywheel-py3_9-cpu-s390x + build_name: manywheel-py3_10-cpu-s390x build_environment: linux-s390x-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cpu-s390x-test: # Testing + manywheel-py3_10-cpu-s390x-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_9-cpu-s390x-build + - manywheel-py3_10-cpu-s390x-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -84,20 +83,19 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cpu-s390x + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cpu-s390x build_environment: linux-s390x-binary-manywheel runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cpu-s390x-upload: # Uploading + manywheel-py3_10-cpu-s390x-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_9-cpu-s390x-test + needs: manywheel-py3_10-cpu-s390x-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel @@ -107,14 +105,13 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cpu-s390x + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cpu-s390x secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-cpu-s390x-build: + manywheel-py3_11-cpu-s390x-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -127,19 +124,18 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.11" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" timeout-minutes: 420 - build_name: manywheel-py3_10-cpu-s390x + build_name: manywheel-py3_11-cpu-s390x build_environment: linux-s390x-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cpu-s390x-test: # Testing + manywheel-py3_11-cpu-s390x-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_10-cpu-s390x-build + - manywheel-py3_11-cpu-s390x-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -151,20 +147,19 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cpu-s390x + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cpu-s390x build_environment: linux-s390x-binary-manywheel runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cpu-s390x-upload: # Uploading + manywheel-py3_11-cpu-s390x-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_10-cpu-s390x-test + needs: manywheel-py3_11-cpu-s390x-test 
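In the s390x nightly file, 3.9 is dropped and the job chains shift up one Python version, with new 3.14 and 3.14t chains added further below. Because this is a generated workflow, the renames fan out mechanically from a version list into build/test/upload job names; the following is a hypothetical sketch of that fan-out, not the actual generator script:

# Hypothetical sketch, not the real workflow generator: how a Python-version
# list expands into the build -> test -> upload job names used in this
# generated s390x file.
PYTHON_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]

def job_chain(py_version: str, arch: str = "cpu-s390x") -> list[str]:
    base = f"manywheel-py{py_version.replace('.', '_')}-{arch}"
    return [f"{base}-build", f"{base}-test", f"{base}-upload"]

for version in PYTHON_VERSIONS:
    print(" -> ".join(job_chain(version)))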
with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel @@ -174,14 +169,13 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cpu-s390x + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cpu-s390x secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-cpu-s390x-build: + manywheel-py3_12-cpu-s390x-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -194,19 +188,18 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" timeout-minutes: 420 - build_name: manywheel-py3_11-cpu-s390x + build_name: manywheel-py3_12-cpu-s390x build_environment: linux-s390x-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cpu-s390x-test: # Testing + manywheel-py3_12-cpu-s390x-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_11-cpu-s390x-build + - manywheel-py3_12-cpu-s390x-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -218,20 +211,19 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cpu-s390x + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cpu-s390x build_environment: linux-s390x-binary-manywheel runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cpu-s390x-upload: # Uploading + manywheel-py3_12-cpu-s390x-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_11-cpu-s390x-test + needs: manywheel-py3_12-cpu-s390x-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel @@ -241,14 +233,13 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cpu-s390x + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cpu-s390x secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_12-cpu-s390x-build: + manywheel-py3_13-cpu-s390x-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -261,19 +252,18 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" timeout-minutes: 420 - build_name: manywheel-py3_12-cpu-s390x + build_name: manywheel-py3_13-cpu-s390x build_environment: linux-s390x-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cpu-s390x-test: # Testing + manywheel-py3_13-cpu-s390x-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_12-cpu-s390x-build + - manywheel-py3_13-cpu-s390x-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -285,20 +275,19 @@ jobs: GPU_ARCH_TYPE: 
cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cpu-s390x + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cpu-s390x build_environment: linux-s390x-binary-manywheel runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cpu-s390x-upload: # Uploading + manywheel-py3_13-cpu-s390x-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_12-cpu-s390x-test + needs: manywheel-py3_13-cpu-s390x-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel @@ -308,14 +297,13 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cpu-s390x + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cpu-s390x secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13-cpu-s390x-build: + manywheel-py3_13t-cpu-s390x-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -328,19 +316,18 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.13t" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" timeout-minutes: 420 - build_name: manywheel-py3_13-cpu-s390x + build_name: manywheel-py3_13t-cpu-s390x build_environment: linux-s390x-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cpu-s390x-test: # Testing + manywheel-py3_13t-cpu-s390x-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_13-cpu-s390x-build + - manywheel-py3_13t-cpu-s390x-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -352,20 +339,19 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cpu-s390x + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cpu-s390x build_environment: linux-s390x-binary-manywheel runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cpu-s390x-upload: # Uploading + manywheel-py3_13t-cpu-s390x-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13-cpu-s390x-test + needs: manywheel-py3_13t-cpu-s390x-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel @@ -375,9 +361,136 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cpu-s390x + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cpu-s390x + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14-cpu-s390x-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # 
favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14" + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + timeout-minutes: 420 + build_name: manywheel-py3_14-cpu-s390x + build_environment: linux-s390x-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cpu-s390x-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14-cpu-s390x-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cpu-s390x + build_environment: linux-s390x-binary-manywheel + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cpu-s390x-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cpu-s390x-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cpu-s390x + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14t-cpu-s390x-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14t" + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + timeout-minutes: 420 + build_name: manywheel-py3_14t-cpu-s390x + build_environment: linux-s390x-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cpu-s390x-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14t-cpu-s390x-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cpu-s390x + build_environment: linux-s390x-binary-manywheel + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cpu-s390x-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cpu-s390x-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a 
legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cpu-s390x secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml index ad7a1cf1d71df..500f8fa07af6b 100644 --- a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml @@ -46,7 +46,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -67,11 +67,6 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml index 7c4cc4ab55176..6aee57b503aa2 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -30,7 +30,7 @@ concurrency: cancel-in-progress: true jobs: - wheel-py3_9-cpu-build: + wheel-py3_10-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: macos-14-xlarge timeout-minutes: 240 @@ -42,7 +42,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -63,11 +63,6 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -115,12 +110,33 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" - if [[ $DESIRED_PYTHON == "3.13t" ]]; then - conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge - SMOKE_TEST_PARAMS="--torch-compile-check disabled" - else - conda create -yn "test_conda_env" python="$DESIRED_PYTHON" - fi + + EXTRA_CONDA_INSTALL_FLAGS="" + CONDA_ENV_CREATE_FLAGS="" + # 
shellcheck disable=SC2153 + case $DESIRED_PYTHON in + 3.14t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.14) + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.13t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" + ;; + *) + # shellcheck disable=SC2153 + desired_python=${DESIRED_PYTHON} + ;; + esac + + # shellcheck disable=SC2086 + conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} conda activate test_conda_env pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v @@ -129,16 +145,16 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_9-cpu + name: wheel-py3_10-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_9-cpu-upload: # Uploading + wheel-py3_10-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_9-cpu-build + needs: wheel-py3_10-cpu-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: wheel @@ -148,13 +164,13 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-cpu + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cpu use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cpu-build: + wheel-py3_11-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: macos-14-xlarge timeout-minutes: 240 @@ -166,7 +182,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.11" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -187,11 +203,6 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -239,12 +250,33 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" - if [[ $DESIRED_PYTHON == "3.13t" ]]; then - conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge - SMOKE_TEST_PARAMS="--torch-compile-check disabled" - else - conda create -yn "test_conda_env" python="$DESIRED_PYTHON" - fi + + EXTRA_CONDA_INSTALL_FLAGS="" + CONDA_ENV_CREATE_FLAGS="" + # shellcheck disable=SC2153 + case $DESIRED_PYTHON in + 3.14t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.14) + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.13t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" + ;; + *) + # shellcheck disable=SC2153 
+ desired_python=${DESIRED_PYTHON} + ;; + esac + + # shellcheck disable=SC2086 + conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} conda activate test_conda_env pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v @@ -253,16 +285,16 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cpu + name: wheel-py3_11-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_10-cpu-upload: # Uploading + wheel-py3_11-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cpu-build + needs: wheel-py3_11-cpu-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: wheel @@ -272,13 +304,13 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cpu + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cpu use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cpu-build: + wheel-py3_12-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: macos-14-xlarge timeout-minutes: 240 @@ -290,7 +322,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -311,11 +343,6 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -363,12 +390,33 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" - if [[ $DESIRED_PYTHON == "3.13t" ]]; then - conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge - SMOKE_TEST_PARAMS="--torch-compile-check disabled" - else - conda create -yn "test_conda_env" python="$DESIRED_PYTHON" - fi + + EXTRA_CONDA_INSTALL_FLAGS="" + CONDA_ENV_CREATE_FLAGS="" + # shellcheck disable=SC2153 + case $DESIRED_PYTHON in + 3.14t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.14) + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.13t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" + ;; + *) + # shellcheck disable=SC2153 + desired_python=${DESIRED_PYTHON} + ;; + esac + + # shellcheck disable=SC2086 + conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} conda activate test_conda_env pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v @@ -377,16 +425,16 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cpu + name: wheel-py3_12-cpu retention-days: 14 if-no-files-found: error path: "${{ 
env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_11-cpu-upload: # Uploading + wheel-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cpu-build + needs: wheel-py3_12-cpu-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: wheel @@ -396,13 +444,13 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cpu + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cpu use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cpu-build: + wheel-py3_13-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: macos-14-xlarge timeout-minutes: 240 @@ -414,7 +462,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -435,11 +483,6 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -487,12 +530,33 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" - if [[ $DESIRED_PYTHON == "3.13t" ]]; then - conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge - SMOKE_TEST_PARAMS="--torch-compile-check disabled" - else - conda create -yn "test_conda_env" python="$DESIRED_PYTHON" - fi + + EXTRA_CONDA_INSTALL_FLAGS="" + CONDA_ENV_CREATE_FLAGS="" + # shellcheck disable=SC2153 + case $DESIRED_PYTHON in + 3.14t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.14) + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.13t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" + ;; + *) + # shellcheck disable=SC2153 + desired_python=${DESIRED_PYTHON} + ;; + esac + + # shellcheck disable=SC2086 + conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} conda activate test_conda_env pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v @@ -501,16 +565,16 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cpu + name: wheel-py3_13-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_12-cpu-upload: # Uploading + wheel-py3_13-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cpu-build + needs: wheel-py3_13-cpu-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: wheel @@ -520,13 +584,13 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cpu + 
DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cpu use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cpu-build: + wheel-py3_13t-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: macos-14-xlarge timeout-minutes: 240 @@ -538,7 +602,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.13t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -559,11 +623,6 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -611,12 +670,33 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" - if [[ $DESIRED_PYTHON == "3.13t" ]]; then - conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge - SMOKE_TEST_PARAMS="--torch-compile-check disabled" - else - conda create -yn "test_conda_env" python="$DESIRED_PYTHON" - fi + + EXTRA_CONDA_INSTALL_FLAGS="" + CONDA_ENV_CREATE_FLAGS="" + # shellcheck disable=SC2153 + case $DESIRED_PYTHON in + 3.14t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.14) + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.13t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" + ;; + *) + # shellcheck disable=SC2153 + desired_python=${DESIRED_PYTHON} + ;; + esac + + # shellcheck disable=SC2086 + conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} conda activate test_conda_env pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v @@ -625,16 +705,16 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cpu + name: wheel-py3_13t-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_13-cpu-upload: # Uploading + wheel-py3_13t-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cpu-build + needs: wheel-py3_13t-cpu-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: wheel @@ -644,13 +724,13 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cpu + DESIRED_PYTHON: "3.13t" + build_name: wheel-py3_13t-cpu use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cpu-build: + wheel-py3_14-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: macos-14-xlarge timeout-minutes: 240 @@ -662,7 +742,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14" steps: # NOTE: These environment variables are 
put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -683,11 +763,6 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -735,12 +810,33 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" - if [[ $DESIRED_PYTHON == "3.13t" ]]; then - conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge - SMOKE_TEST_PARAMS="--torch-compile-check disabled" - else - conda create -yn "test_conda_env" python="$DESIRED_PYTHON" - fi + + EXTRA_CONDA_INSTALL_FLAGS="" + CONDA_ENV_CREATE_FLAGS="" + # shellcheck disable=SC2153 + case $DESIRED_PYTHON in + 3.14t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.14) + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.13t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" + ;; + *) + # shellcheck disable=SC2153 + desired_python=${DESIRED_PYTHON} + ;; + esac + + # shellcheck disable=SC2086 + conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} conda activate test_conda_env pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v @@ -749,16 +845,16 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cpu + name: wheel-py3_14-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_13t-cpu-upload: # Uploading + wheel-py3_14-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cpu-build + needs: wheel-py3_14-cpu-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: wheel @@ -768,8 +864,148 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cpu + DESIRED_PYTHON: "3.14" + build_name: wheel-py3_14-cpu + use_s3: False + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_14t-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-14-xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.14t" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + set -eux -o pipefail + # shellcheck disable=SC1090 + source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" + mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" + + # Build + USE_PYTORCH_METAL_EXPORT=1 + USE_COREML_DELEGATE=1 + TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}" + export USE_PYTORCH_METAL_EXPORT + export USE_COREML_DELEGATE + export TORCH_PACKAGE_NAME + "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" + - name: Test PyTorch wheel + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + set -eux -o pipefail + # shellcheck disable=SC1090 + source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" + pip uninstall -y "$TORCH_PACKAGE_NAME" || true + pip uninstall -y "$TORCH_PACKAGE_NAME" || true + + # Create new "clean" conda environment for testing + + SMOKE_TEST_PARAMS="" + + EXTRA_CONDA_INSTALL_FLAGS="" + CONDA_ENV_CREATE_FLAGS="" + # shellcheck disable=SC2153 + case $DESIRED_PYTHON in + 3.14t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.14) + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.13t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" + ;; + *) + # shellcheck disable=SC2153 + desired_python=${DESIRED_PYTHON} + ;; + esac + + # shellcheck disable=SC2086 + conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} + conda activate test_conda_env + pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v + + # shellcheck disable=SC2086 + python "${PYTORCH_ROOT}/.ci/pytorch/smoke_test/smoke_test.py" --package torchonly ${SMOKE_TEST_PARAMS} + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_14t-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_14t-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' 
}} + permissions: + id-token: write + contents: read + needs: wheel-py3_14t-cpu-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + DESIRED_PYTHON: "3.14t" + build_name: wheel-py3_14t-cpu use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml index 2c86e7e103598..7c26dbc3b9eea 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml @@ -51,7 +51,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -64,7 +64,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Populate binary env shell: cmd @@ -128,7 +128,7 @@ jobs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -141,7 +141,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Populate binary env shell: cmd @@ -201,7 +201,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cpu-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml index 912a452f0ee8a..5e30b66183840 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml @@ -51,7 +51,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -64,7 +64,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Populate binary env shell: cmd @@ -128,7 +128,7 @@ jobs: - libtorch-cpu-shared-with-deps-release-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -141,7 +141,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for 
libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Populate binary env shell: cmd @@ -201,7 +201,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cpu-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml index 1dd70d0d06a91..1368bc942350e 100644 --- a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml @@ -51,7 +51,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -124,7 +124,7 @@ jobs: - wheel-py3_11-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -198,7 +198,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -271,7 +271,7 @@ jobs: - wheel-py3_12-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -345,7 +345,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -418,7 +418,7 @@ jobs: - wheel-py3_13-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml index ac15a9f3e97ac..818d2ca45cc4c 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml @@ -38,7 +38,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -51,7 +51,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -153,7 +153,7 @@ jobs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type 
}}windows.4xlarge.nonephemeral" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -166,7 +166,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml index 75c393b46e59b..67fdecdf6e866 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml @@ -45,7 +45,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -58,7 +58,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -160,7 +160,7 @@ jobs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -173,7 +173,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -283,7 +283,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cpu-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -292,21 +292,21 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -408,21 +408,21 @@ jobs: - libtorch-cuda12_6-shared-with-deps-debug-build - get-label-type runs-on: "${{ 
needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -527,13 +527,13 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cuda12_6-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -542,21 +542,21 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -658,21 +658,21 @@ jobs: - libtorch-cuda12_8-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -777,36 +777,36 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cuda12_8-shared-with-deps-debug 
secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda12_9-shared-with-deps-debug-build: + libtorch-cuda13_0-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -884,7 +884,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: libtorch-cuda12_9-shared-with-deps-debug + name: libtorch-cuda13_0-shared-with-deps-debug retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -902,27 +902,27 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda12_9-shared-with-deps-debug-test: # Testing + libtorch-cuda13_0-shared-with-deps-debug-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - libtorch-cuda12_9-shared-with-deps-debug-build + - libtorch-cuda13_0-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -992,7 +992,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: libtorch-cuda12_9-shared-with-deps-debug + name: libtorch-cuda13_0-shared-with-deps-debug path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1015,26 +1015,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda12_9-shared-with-deps-debug-upload: # Uploading + libtorch-cuda13_0-shared-with-deps-debug-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: libtorch-cuda12_9-shared-with-deps-debug-test + needs: libtorch-cuda13_0-shared-with-deps-debug-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: debug 
LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" - build_name: libtorch-cuda12_9-shared-with-deps-debug + DESIRED_PYTHON: "3.10" + build_name: libtorch-cuda13_0-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-libtorch-release-main.yml b/.github/workflows/generated-windows-binary-libtorch-release-main.yml index 9a0a3496e37b3..ff8a2bbbfe1ef 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-main.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-main.yml @@ -38,7 +38,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -51,7 +51,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -153,7 +153,7 @@ jobs: - libtorch-cpu-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -166,7 +166,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash diff --git a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml index eccd332c74a1f..8efca3b7571b9 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml @@ -45,7 +45,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -58,7 +58,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -160,7 +160,7 @@ jobs: - libtorch-cpu-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -173,7 +173,7 @@ jobs: LIBTORCH_VARIANT: 
shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -283,7 +283,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cpu-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -292,21 +292,21 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -408,21 +408,21 @@ jobs: - libtorch-cuda12_6-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -527,13 +527,13 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cuda12_6-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -542,21 +542,21 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch 
to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -658,21 +658,21 @@ jobs: - libtorch-cuda12_8-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -777,36 +777,36 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cuda12_8-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda12_9-shared-with-deps-release-build: + libtorch-cuda13_0-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -884,7 +884,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: libtorch-cuda12_9-shared-with-deps-release + name: libtorch-cuda13_0-shared-with-deps-release retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -902,27 +902,27 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda12_9-shared-with-deps-release-test: # Testing + libtorch-cuda13_0-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - libtorch-cuda12_9-shared-with-deps-release-build + - libtorch-cuda13_0-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type 
}}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -992,7 +992,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: libtorch-cuda12_9-shared-with-deps-release + name: libtorch-cuda13_0-shared-with-deps-release path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1015,26 +1015,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading + libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: libtorch-cuda12_9-shared-with-deps-release-test + needs: libtorch-cuda13_0-shared-with-deps-release-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" - build_name: libtorch-cuda12_9-shared-with-deps-release + DESIRED_PYTHON: "3.10" + build_name: libtorch-cuda13_0-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-wheel-nightly.yml b/.github/workflows/generated-windows-binary-wheel-nightly.yml index 22ebe8db70eac..154dadbe6a1e3 100644 --- a/.github/workflows/generated-windows-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml @@ -41,11 +41,1196 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - wheel-py3_9-cpu-build: + wheel-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_10-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_10-cpu-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_10-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-cuda12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + 
# TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6" + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
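The "Display EC2 information" step uses IMDSv2: it first obtains a short-lived session token with a PUT request, then sends that token in the X-aws-ec2-metadata-token header when reading a metadata category. A rough Python equivalent of the curl-based get_ec2_metadata helper (sketch only):

    # Sketch only: a Python equivalent of the curl-based get_ec2_metadata helper
    # above, using IMDSv2 (token via PUT, then token-authenticated GET).
    import urllib.request

    IMDS = "http://169.254.169.254/latest"

    def get_ec2_metadata(category: str, ttl_seconds: int = 30) -> str:
        token_req = urllib.request.Request(
            f"{IMDS}/api/token",
            method="PUT",
            headers={"X-aws-ec2-metadata-token-ttl-seconds": str(ttl_seconds)},
        )
        with urllib.request.urlopen(token_req, timeout=5) as resp:
            token = resp.read().decode()
        meta_req = urllib.request.Request(
            f"{IMDS}/meta-data/{category}",
            headers={"X-aws-ec2-metadata-token": token},
        )
        with urllib.request.urlopen(meta_req, timeout=5) as resp:
            return resp.read().decode()

    for category in ("ami-id", "instance-id", "instance-type"):
        print(f"{category}: {get_ec2_metadata(category)}")
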
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_10-cuda12_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_10-cuda12_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_10-cuda12_6-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6" + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda12_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cuda12_6-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6" + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cuda12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-cuda12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ 
github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8" + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
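The checkout steps in these jobs pin the ref with github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha, i.e. the pull request's head commit on pull_request events and the pushed commit otherwise; the &&/|| chain acts as a conditional because a commit SHA is never an empty string. An equivalent sketch in Python (illustrative only):

    # Sketch: the checkout step's ref expression behaves like a conditional.
    def resolve_checkout_ref(event_name: str, pr_head_sha: str | None, pushed_sha: str) -> str:
        # Same selection logic as the workflow expression; the &&/|| form is safe
        # here because a commit SHA is never falsy.
        return pr_head_sha if event_name == "pull_request" and pr_head_sha else pushed_sha

    assert resolve_checkout_ref("pull_request", "abc123", "def456") == "abc123"
    assert resolve_checkout_ref("push", None, "def456") == "def456"
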
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_10-cuda12_8 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_10-cuda12_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_10-cuda12_8-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8" + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda12_8 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cuda12_8-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8" + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cuda12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-cuda13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ 
github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
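Every configuration above follows the same build, test, upload triplet, and the artifact and build_name values follow one pattern: the package type, the Python version with dots replaced by underscores, and an accelerator tag derived from DESIRED_CUDA. The helper below is purely illustrative (it does not exist in the repository) and only restates that visible convention:

    def build_name(package_type: str, python_version: str, desired_cuda: str) -> str:
        # Hypothetical helper, only restating the naming pattern of the jobs above.
        py_tag = "py" + python_version.replace(".", "_")
        accel = desired_cuda
        if accel.startswith("cu") and accel[2:].isdigit():
            digits = accel[2:]                      # "126" -> "12_6", "130" -> "13_0"
            accel = f"cuda{digits[:-1]}_{digits[-1]}"
        return f"{package_type}-{py_tag}-{accel}"

    assert build_name("wheel", "3.10", "cpu") == "wheel-py3_10-cpu"
    assert build_name("wheel", "3.10", "cu126") == "wheel-py3_10-cuda12_6"
    assert build_name("wheel", "3.10", "cu130") == "wheel-py3_10-cuda13_0"
    assert build_name("wheel", "3.10", "xpu") == "wheel-py3_10-xpu"
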
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_10-cuda13_0 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_10-cuda13_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_10-cuda13_0-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda13_0 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cuda13_0-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cuda13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-xpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ 
github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
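The XPU build above pins its Intel runtime dependencies through PYTORCH_EXTRA_INSTALL_REQUIREMENTS, a pipe-separated list in which several entries carry PEP 508 environment markers so they only apply on Linux/x86_64. The sketch below shows how such markers evaluate with the packaging library; splitting on '|' mirrors how the string appears to be consumed downstream, which is an assumption here:

    # Sketch: interpreting the '|'-separated requirement string with PEP 508 markers.
    from packaging.requirements import Requirement

    extra = (
        "intel-cmplr-lib-rt==2025.2.1 | "
        "oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "mkl==2025.2.0"
    )

    for spec in (s.strip() for s in extra.split("|")):
        req = Requirement(spec)
        applies = req.marker.evaluate() if req.marker is not None else True
        print(f"{req.name}{req.specifier}: install on this platform -> {applies}")
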
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_10-xpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_10-xpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_10-xpu-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_10-xpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-xpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-xpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-xpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_11-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: 
wheel @@ -54,7 +1239,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -132,7 +1317,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_9-cpu + name: wheel-py3_11-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -150,13 +1335,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cpu-test: # Testing + wheel-py3_11-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_9-cpu-build + - wheel-py3_11-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -165,7 +1350,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -235,7 +1420,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_9-cpu + name: wheel-py3_11-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -258,12 +1443,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cpu-upload: # Uploading + wheel-py3_11-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_9-cpu-test + needs: wheel-py3_11-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -271,26 +1456,26 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu - DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-cpu + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_9-cuda12_6-build: + wheel-py3_11-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -368,7 +1553,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_9-cuda12_6 + name: wheel-py3_11-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -386,23 +1571,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda12_6-test: # Testing + wheel-py3_11-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_9-cuda12_6-build + - wheel-py3_11-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + 
timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -472,7 +1657,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_9-cuda12_6 + name: wheel-py3_11-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -495,40 +1680,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda12_6-upload: # Uploading + wheel-py3_11-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_9-cuda12_6-test + needs: wheel-py3_11-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-cuda12_6 + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_9-cuda12_8-build: + wheel-py3_11-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -606,7 +1791,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_9-cuda12_8 + name: wheel-py3_11-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -624,23 +1809,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda12_8-test: # Testing + wheel-py3_11-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_9-cuda12_8-build + - wheel-py3_11-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -710,7 +1895,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_9-cuda12_8 + name: wheel-py3_11-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ 
-733,40 +1918,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda12_8-upload: # Uploading + wheel-py3_11-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_9-cuda12_8-test + needs: wheel-py3_11-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-cuda12_8 + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_9-cuda12_9-build: + wheel-py3_11-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -844,7 +2029,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_9-cuda12_9 + name: wheel-py3_11-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -862,23 +2047,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda12_9-test: # Testing + wheel-py3_11-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_9-cuda12_9-build + - wheel-py3_11-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -948,7 +2133,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_9-cuda12_9 + name: wheel-py3_11-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -971,30 +2156,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda12_9-upload: # Uploading + wheel-py3_11-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_9-cuda12_9-test + needs: wheel-py3_11-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" 
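Besides the cu129-to-cu130 and 3.9-to-3.11 renames, these hunks also change GPU_ARCH_VERSION from an unquoted scalar (12.6, 12.8, 12.9) to a quoted string ("12.6", "12.8", "13.0"). Unquoted, YAML resolves such values as floats, which can alter their textual form (the same reason DESIRED_PYTHON is quoted, since 3.10 would otherwise load as 3.1), so quoting preserves the exact version string; that is presumably the motivation here. A small PyYAML sketch (assumes PyYAML is installed; GitHub Actions' own parser is assumed to resolve these scalars the same way):

    # Sketch: why quoting version-like values in workflow YAML matters.
    import yaml

    print(yaml.safe_load("DESIRED_PYTHON: 3.10"))      # {'DESIRED_PYTHON': 3.1}    float, trailing zero lost
    print(yaml.safe_load('DESIRED_PYTHON: "3.10"'))    # {'DESIRED_PYTHON': '3.10'} exact string
    print(yaml.safe_load("GPU_ARCH_VERSION: 13.0"))    # {'GPU_ARCH_VERSION': 13.0} float, not the string "13.0"
    print(yaml.safe_load('GPU_ARCH_VERSION: "13.0"'))  # {'GPU_ARCH_VERSION': '13.0'}
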
GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-cuda12_9 + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_9-xpu-build: + wheel-py3_11-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1003,8 +2188,8 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + DESIRED_PYTHON: "3.11" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -1082,7 +2267,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_9-xpu + name: wheel-py3_11-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1100,13 +2285,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-xpu-test: # Testing + wheel-py3_11-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_9-xpu-build + - wheel-py3_11-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1115,7 +2300,7 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -1185,7 +2370,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_9-xpu + name: wheel-py3_11-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1208,12 +2393,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-xpu-upload: # Uploading + 
wheel-py3_11-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_9-xpu-test + needs: wheel-py3_11-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1221,16 +2406,16 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu - DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-xpu + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cpu-build: + wheel-py3_12-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1239,7 +2424,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.12" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -1317,7 +2502,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cpu + name: wheel-py3_12-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1335,13 +2520,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-test: # Testing + wheel-py3_12-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cpu-build + - wheel-py3_12-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1350,7 +2535,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -1420,7 +2605,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cpu + name: wheel-py3_12-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1443,12 +2628,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-upload: # Uploading + wheel-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cpu-test + needs: wheel-py3_12-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1456,26 +2641,26 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cpu + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda12_6-build: + wheel-py3_12-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda 
SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.12" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -1553,7 +2738,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cuda12_6 + name: wheel-py3_12-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1571,23 +2756,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_6-test: # Testing + wheel-py3_12-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cuda12_6-build + - wheel-py3_12-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -1657,7 +2842,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cuda12_6 + name: wheel-py3_12-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1680,40 +2865,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_6-upload: # Uploading + wheel-py3_12-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cuda12_6-test + needs: wheel-py3_12-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda12_6 + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda12_8-build: + wheel-py3_12-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.12" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -1791,7 +2976,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cuda12_8 + name: wheel-py3_12-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1809,23 +2994,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_8-test: # Testing + 
wheel-py3_12-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cuda12_8-build + - wheel-py3_12-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -1895,7 +3080,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cuda12_8 + name: wheel-py3_12-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1918,40 +3103,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_8-upload: # Uploading + wheel-py3_12-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cuda12_8-test + needs: wheel-py3_12-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda12_8 + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda12_9-build: + wheel-py3_12-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.12" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -2029,7 +3214,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cuda12_9 + name: wheel-py3_12-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2047,23 +3232,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_9-test: # Testing + wheel-py3_12-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cuda12_9-build + - wheel-py3_12-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: 
"3.10" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -2133,7 +3318,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cuda12_9 + name: wheel-py3_12-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -2156,30 +3341,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_9-upload: # Uploading + wheel-py3_12-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cuda12_9-test + needs: wheel-py3_12-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda12_9 + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-xpu-build: + wheel-py3_12-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2188,8 +3373,8 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + DESIRED_PYTHON: "3.12" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -2267,7 +3452,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-xpu + name: wheel-py3_12-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2285,13 +3470,13 @@ jobs: run: | 
.github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-xpu-test: # Testing + wheel-py3_12-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-xpu-build + - wheel-py3_12-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2300,7 +3485,7 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -2370,7 +3555,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-xpu + name: wheel-py3_12-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -2393,12 +3578,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-xpu-upload: # Uploading + wheel-py3_12-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-xpu-test + needs: wheel-py3_12-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2406,16 +3591,16 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-xpu + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cpu-build: + wheel-py3_13-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2424,7 +3609,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -2502,7 +3687,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cpu + name: wheel-py3_13-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2520,13 +3705,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cpu-test: # Testing + wheel-py3_13-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cpu-build + - wheel-py3_13-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2535,7 +3720,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: - name: Display EC2 information shell: bash @@ -2605,7 +3790,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cpu + name: wheel-py3_13-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -2628,12 +3813,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cpu-upload: # Uploading + wheel-py3_13-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: 
id-token: write contents: read - needs: wheel-py3_11-cpu-test + needs: wheel-py3_13-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2641,26 +3826,26 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cpu + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda12_6-build: + wheel-py3_13-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -2738,7 +3923,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cuda12_6 + name: wheel-py3_13-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2756,23 +3941,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_6-test: # Testing + wheel-py3_13-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cuda12_6-build + - wheel-py3_13-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: - name: Display EC2 information shell: bash @@ -2842,7 +4027,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cuda12_6 + name: wheel-py3_13-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -2865,40 +4050,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_6-upload: # Uploading + wheel-py3_13-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cuda12_6-test + needs: wheel-py3_13-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda12_6 + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda12_8-build: + wheel-py3_13-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ 
needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -2976,7 +4161,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cuda12_8 + name: wheel-py3_13-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2994,23 +4179,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_8-test: # Testing + wheel-py3_13-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cuda12_8-build + - wheel-py3_13-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: - name: Display EC2 information shell: bash @@ -3080,7 +4265,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cuda12_8 + name: wheel-py3_13-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -3103,40 +4288,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_8-upload: # Uploading + wheel-py3_13-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cuda12_8-test + needs: wheel-py3_13-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda12_8 + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda12_9-build: + wheel-py3_13-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us 
access to the @@ -3214,7 +4399,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cuda12_9 + name: wheel-py3_13-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3232,23 +4417,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_9-test: # Testing + wheel-py3_13-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cuda12_9-build + - wheel-py3_13-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: - name: Display EC2 information shell: bash @@ -3318,7 +4503,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cuda12_9 + name: wheel-py3_13-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -3341,30 +4526,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_9-upload: # Uploading + wheel-py3_13-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cuda12_9-test + needs: wheel-py3_13-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda12_9 + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-xpu-build: + wheel-py3_13-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3373,8 +4558,8 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + DESIRED_PYTHON: "3.13" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' 
and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -3452,7 +4637,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-xpu + name: wheel-py3_13-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3470,13 +4655,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-xpu-test: # Testing + wheel-py3_13-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-xpu-build + - wheel-py3_13-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3485,7 +4670,7 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: - name: Display EC2 information shell: bash @@ -3555,7 +4740,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-xpu + name: wheel-py3_13-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -3578,12 +4763,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-xpu-upload: # Uploading + wheel-py3_13-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-xpu-test + needs: wheel-py3_13-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3591,16 +4776,16 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-xpu + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cpu-build: + wheel-py3_13t-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3609,7 +4794,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -3687,7 +4872,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cpu + name: wheel-py3_13t-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3705,13 +4890,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cpu-test: # Testing + wheel-py3_13t-cpu-test: # Testing if: ${{ 
github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cpu-build + - wheel-py3_13t-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3720,7 +4905,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: - name: Display EC2 information shell: bash @@ -3790,7 +4975,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cpu + name: wheel-py3_13t-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -3813,12 +4998,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cpu-upload: # Uploading + wheel-py3_13t-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cpu-test + needs: wheel-py3_13t-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3826,26 +5011,26 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu - DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cpu + DESIRED_PYTHON: "3.13t" + build_name: wheel-py3_13t-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda12_6-build: + wheel-py3_13t-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -3923,7 +5108,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cuda12_6 + name: wheel-py3_13t-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3941,23 +5126,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_6-test: # Testing + wheel-py3_13t-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cuda12_6-build + - wheel-py3_13t-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: - name: Display EC2 information shell: bash @@ -4027,7 +5212,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cuda12_6 + name: wheel-py3_13t-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -4050,40 +5235,40 @@ jobs: 
if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_6-upload: # Uploading + wheel-py3_13t-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cuda12_6-test + needs: wheel-py3_13t-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cuda12_6 + DESIRED_PYTHON: "3.13t" + build_name: wheel-py3_13t-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda12_8-build: + wheel-py3_13t-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -4161,7 +5346,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cuda12_8 + name: wheel-py3_13t-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4179,23 +5364,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_8-test: # Testing + wheel-py3_13t-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cuda12_8-build + - wheel-py3_13t-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: - name: Display EC2 information shell: bash @@ -4265,7 +5450,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cuda12_8 + name: wheel-py3_13t-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -4288,40 +5473,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_8-upload: # Uploading + wheel-py3_13t-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cuda12_8-test + needs: wheel-py3_13t-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.12" - build_name: 
wheel-py3_12-cuda12_8 + DESIRED_PYTHON: "3.13t" + build_name: wheel-py3_13t-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda12_9-build: + wheel-py3_13t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -4399,7 +5584,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cuda12_9 + name: wheel-py3_13t-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4417,23 +5602,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_9-test: # Testing + wheel-py3_13t-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cuda12_9-build + - wheel-py3_13t-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: - name: Display EC2 information shell: bash @@ -4503,7 +5688,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cuda12_9 + name: wheel-py3_13t-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -4526,30 +5711,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_9-upload: # Uploading + wheel-py3_13t-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cuda12_9-test + needs: wheel-py3_13t-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cuda12_9 + DESIRED_PYTHON: "3.13t" + build_name: wheel-py3_13t-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-xpu-build: + wheel-py3_13t-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4558,8 +5743,8 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu 
SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + DESIRED_PYTHON: "3.13t" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -4637,7 +5822,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-xpu + name: wheel-py3_13t-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4655,13 +5840,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-xpu-test: # Testing + wheel-py3_13t-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-xpu-build + - wheel-py3_13t-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4670,7 +5855,7 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: - name: Display EC2 information shell: bash @@ -4740,7 +5925,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-xpu + name: wheel-py3_13t-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -4763,12 +5948,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-xpu-upload: # Uploading + wheel-py3_13t-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-xpu-test + needs: wheel-py3_13t-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4776,16 +5961,16 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu - DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-xpu + DESIRED_PYTHON: "3.13t" + build_name: wheel-py3_13t-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cpu-build: + 
wheel-py3_14-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4794,7 +5979,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.14" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -4872,7 +6057,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cpu + name: wheel-py3_14-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4890,13 +6075,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cpu-test: # Testing + wheel-py3_14-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cpu-build + - wheel-py3_14-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4905,7 +6090,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.14" steps: - name: Display EC2 information shell: bash @@ -4975,7 +6160,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cpu + name: wheel-py3_14-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -4998,12 +6183,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cpu-upload: # Uploading + wheel-py3_14-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cpu-test + needs: wheel-py3_14-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5011,26 +6196,26 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu - DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cpu + DESIRED_PYTHON: "3.14" + build_name: wheel-py3_14-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cuda12_6-build: + wheel-py3_14-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.14" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -5108,7 +6293,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cuda12_6 + name: wheel-py3_14-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5126,23 +6311,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - 
wheel-py3_13-cuda12_6-test: # Testing + wheel-py3_14-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cuda12_6-build + - wheel-py3_14-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.14" steps: - name: Display EC2 information shell: bash @@ -5212,7 +6397,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cuda12_6 + name: wheel-py3_14-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -5235,40 +6420,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_6-upload: # Uploading + wheel-py3_14-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cuda12_6-test + needs: wheel-py3_14-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cuda12_6 + DESIRED_PYTHON: "3.14" + build_name: wheel-py3_14-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cuda12_8-build: + wheel-py3_14-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.14" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -5346,7 +6531,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cuda12_8 + name: wheel-py3_14-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5364,23 +6549,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_8-test: # Testing + wheel-py3_14-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cuda12_8-build + - wheel-py3_14-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + 
DESIRED_PYTHON: "3.14" steps: - name: Display EC2 information shell: bash @@ -5450,7 +6635,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cuda12_8 + name: wheel-py3_14-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -5473,40 +6658,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_8-upload: # Uploading + wheel-py3_14-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cuda12_8-test + needs: wheel-py3_14-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cuda12_8 + DESIRED_PYTHON: "3.14" + build_name: wheel-py3_14-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cuda12_9-build: + wheel-py3_14-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.14" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -5584,7 +6769,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cuda12_9 + name: wheel-py3_14-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5602,23 +6787,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_9-test: # Testing + wheel-py3_14-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cuda12_9-build + - wheel-py3_14-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.14" steps: - name: Display EC2 information shell: bash @@ -5688,7 +6873,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cuda12_9 + name: wheel-py3_14-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -5711,30 +6896,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_9-upload: # Uploading + wheel-py3_14-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read 
- needs: wheel-py3_13-cuda12_9-test + needs: wheel-py3_14-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cuda12_9 + DESIRED_PYTHON: "3.14" + build_name: wheel-py3_14-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-xpu-build: + wheel-py3_14-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5743,8 +6928,8 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + DESIRED_PYTHON: "3.14" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -5822,7 +7007,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-xpu + name: wheel-py3_14-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5840,13 +7025,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-xpu-test: # Testing + wheel-py3_14-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-xpu-build + - wheel-py3_14-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5855,7 +7040,7 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.14" steps: - name: Display EC2 information shell: bash @@ -5925,7 +7110,7 @@ jobs: - 
uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-xpu + name: wheel-py3_14-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -5948,12 +7133,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-xpu-upload: # Uploading + wheel-py3_14-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-xpu-test + needs: wheel-py3_14-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5961,16 +7146,16 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu - DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-xpu + DESIRED_PYTHON: "3.14" + build_name: wheel-py3_14-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cpu-build: + wheel-py3_14t-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5979,7 +7164,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -6057,7 +7242,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cpu + name: wheel-py3_14t-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -6075,13 +7260,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cpu-test: # Testing + wheel-py3_14t-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cpu-build + - wheel-py3_14t-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6090,7 +7275,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14t" steps: - name: Display EC2 information shell: bash @@ -6160,7 +7345,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cpu + name: wheel-py3_14t-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -6183,12 +7368,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cpu-upload: # Uploading + wheel-py3_14t-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cpu-test + needs: wheel-py3_14t-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6196,26 +7381,26 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu - DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cpu + DESIRED_PYTHON: "3.14t" + build_name: wheel-py3_14t-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cuda12_6-build: + wheel-py3_14t-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ 
needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -6293,7 +7478,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cuda12_6 + name: wheel-py3_14t-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -6311,23 +7496,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_6-test: # Testing + wheel-py3_14t-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cuda12_6-build + - wheel-py3_14t-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14t" steps: - name: Display EC2 information shell: bash @@ -6397,7 +7582,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cuda12_6 + name: wheel-py3_14t-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -6420,40 +7605,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_6-upload: # Uploading + wheel-py3_14t-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cuda12_6-test + needs: wheel-py3_14t-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cuda12_6 + DESIRED_PYTHON: "3.14t" + build_name: wheel-py3_14t-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cuda12_8-build: + wheel-py3_14t-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us 
access to the @@ -6531,7 +7716,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cuda12_8 + name: wheel-py3_14t-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -6549,23 +7734,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_8-test: # Testing + wheel-py3_14t-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cuda12_8-build + - wheel-py3_14t-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14t" steps: - name: Display EC2 information shell: bash @@ -6635,7 +7820,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cuda12_8 + name: wheel-py3_14t-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -6658,40 +7843,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_8-upload: # Uploading + wheel-py3_14t-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cuda12_8-test + needs: wheel-py3_14t-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cuda12_8 + DESIRED_PYTHON: "3.14t" + build_name: wheel-py3_14t-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cuda12_9-build: + wheel-py3_14t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -6769,7 +7954,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cuda12_9 + name: wheel-py3_14t-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -6787,23 +7972,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_9-test: # Testing + wheel-py3_14t-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cuda12_9-build + - wheel-py3_14t-cuda13_0-build - get-label-type runs-on: "${{ 
needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14t" steps: - name: Display EC2 information shell: bash @@ -6873,7 +8058,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cuda12_9 + name: wheel-py3_14t-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -6896,30 +8081,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_9-upload: # Uploading + wheel-py3_14t-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cuda12_9-test + needs: wheel-py3_14t-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cuda12_9 + DESIRED_PYTHON: "3.14t" + build_name: wheel-py3_14t-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-xpu-build: + wheel-py3_14t-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6928,8 +8113,8 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + DESIRED_PYTHON: "3.14t" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables 
are put here so that they can be applied on every job equally
      # They are also here because setting them at a workflow level doesn't give us access to the
@@ -7007,7 +8192,7 @@ jobs:
      - uses: actions/upload-artifact@v4.4.0
        if: always()
        with:
-          name: wheel-py3_13t-xpu
+          name: wheel-py3_14t-xpu
          retention-days: 14
          if-no-files-found: error
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -7025,13 +8210,13 @@ jobs:
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_13t-xpu-test: # Testing
+  wheel-py3_14t-xpu-test: # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
-      - wheel-py3_13t-xpu-build
+      - wheel-py3_14t-xpu-build
      - get-label-type
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
-    timeout-minutes: 300
+    timeout-minutes: 360
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
      PACKAGE_TYPE: wheel
@@ -7040,7 +8225,7 @@ jobs:
      DESIRED_CUDA: xpu
      GPU_ARCH_TYPE: xpu
      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.13t"
+      DESIRED_PYTHON: "3.14t"
    steps:
      - name: Display EC2 information
        shell: bash
@@ -7110,7 +8295,7 @@ jobs:
      - uses: actions/download-artifact@v4.1.7
        name: Download Build Artifacts
        with:
-          name: wheel-py3_13t-xpu
+          name: wheel-py3_14t-xpu
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
      - name: Populate binary env
        shell: bash
@@ -7133,12 +8318,12 @@ jobs:
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_13t-xpu-upload: # Uploading
+  wheel-py3_14t-xpu-upload: # Uploading
    if: ${{ github.repository_owner == 'pytorch' }}
    permissions:
      id-token: write
      contents: read
-    needs: wheel-py3_13t-xpu-test
+    needs: wheel-py3_14t-xpu-test
    with:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
      PACKAGE_TYPE: wheel
@@ -7146,8 +8331,8 @@ jobs:
      # favor of GPU_ARCH_VERSION
      DESIRED_CUDA: xpu
      GPU_ARCH_TYPE: xpu
-      DESIRED_PYTHON: "3.13t"
-      build_name: wheel-py3_13t-xpu
+      DESIRED_PYTHON: "3.14t"
+      build_name: wheel-py3_14t-xpu
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml
diff --git a/.github/workflows/h100-cutlass-backend.yml b/.github/workflows/h100-cutlass-backend.yml
index 82dc2ae2a3944..edf4c2e0e807c 100644
--- a/.github/workflows/h100-cutlass-backend.yml
+++ b/.github/workflows/h100-cutlass-backend.yml
@@ -4,9 +4,12 @@ on:
  pull_request:
    paths:
      - .github/workflows/h100-cutlass-backend.yml
+      - torch/_inductor/codegen/cuda/**
+      - test/inductor/test_cutlass_backend.py
+      - test/inductor/test_cutlass_evt.py
  workflow_dispatch:
  schedule:
-    - cron: 22 9 * * * # every 24 hours about 2:22am PDT
+    - cron: 22 9,21 * * * # every 12 hours
  push:
    tags:
      - ciflow/h100-cutlass-backend/*
diff --git a/.github/workflows/inductor-micro-benchmark-x86.yml b/.github/workflows/inductor-micro-benchmark-x86.yml
index 117183428abc1..c6cc075e6b270 100644
--- a/.github/workflows/inductor-micro-benchmark-x86.yml
+++ b/.github/workflows/inductor-micro-benchmark-x86.yml
@@ -18,13 +18,13 @@ permissions:
  contents: read
jobs:
-  linux-jammy-cpu-py3_9-gcc11-inductor-build:
+  inductor-build:
    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
-    name: linux-jammy-cpu-py3.9-gcc11-inductor
+    name: inductor-build
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-jammy-py3.9-gcc11
-      docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
+      docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
      # Use metal host for benchmark jobs
      test-matrix: |
        { include: [
@@ -32,13 +32,13 @@ jobs:
        ]}
    secrets: inherit
-
linux-jammy-cpu-py3_9-gcc11-inductor-micro-benchmark-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-micro-benchmark-test: + name: inductor-micro-benchmark-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-build with: build-environment: linux-jammy-py3.9-gcc11 - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 secrets: inherit diff --git a/.github/workflows/inductor-nightly.yml b/.github/workflows/inductor-nightly.yml index c17a4ed6341aa..fe0f102406b6a 100644 --- a/.github/workflows/inductor-nightly.yml +++ b/.github/workflows/inductor-nightly.yml @@ -32,13 +32,13 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build: - name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks + nightly-dynamo-benchmarks-build: + name: nightly-dynamo-benchmarks-build uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | { include: [ @@ -51,13 +51,13 @@ jobs: build-additional-packages: "vision audio torchao" secrets: inherit - linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-test: - name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks + nightly-dynamo-benchmarks-test: + name: nightly-dynamo-benchmarks-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build + needs: nightly-dynamo-benchmarks-build with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.test-matrix }} + docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }} timeout-minutes: 720 secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-h100.yml b/.github/workflows/inductor-perf-test-nightly-h100.yml index 2b59777aae8c7..41210f89c9a89 100644 --- a/.github/workflows/inductor-perf-test-nightly-h100.yml +++ b/.github/workflows/inductor-perf-test-nightly-h100.yml @@ -58,9 +58,14 @@ on: required: false type: string default: inductor_huggingface_perf_cuda_h100,inductor_timm_perf_cuda_h100,inductor_torchbench_perf_cuda_h100 + pull_request: + # Changing these files guarantees that this workflow needs to be run + paths: + - .github/workflows/inductor-perf-test-nightly-h100.yml + - .ci/docker/ci_commit_pins/huggingface-requirements.txt concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ 
github.event_name == 'schedule' }} cancel-in-progress: true permissions: @@ -79,9 +84,8 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - # NB: Keep this in sync with trunk.yml build: - name: cuda12.8-py3.10-gcc9-sm90 + name: build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -123,7 +127,7 @@ jobs: secrets: inherit test-periodically: - name: cuda12.8-py3.10-gcc9-sm90 + name: test-periodically uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '15 0,12 * * 1-6' @@ -140,7 +144,7 @@ jobs: secrets: inherit test-weekly: - name: cuda12.8-py3.10-gcc9-sm90 + name: test-weekly uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '0 7 * * 0' @@ -157,13 +161,15 @@ jobs: secrets: inherit test: - name: cuda12.8-py3.10-gcc9-sm90 + name: test uses: ./.github/workflows/_linux-test.yml needs: build - if: github.event_name == 'workflow_dispatch' + # The pull_request trigger is used in PR to bump transformers pin which always + # needs one round of benchmark + if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }} with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 - dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }} + dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }} docker-image: ${{ needs.build.outputs.docker-image }} test-matrix: ${{ needs.build.outputs.test-matrix }} timeout-minutes: 720 diff --git a/.github/workflows/inductor-perf-test-nightly-macos.yml b/.github/workflows/inductor-perf-test-nightly-macos.yml index 0d92455a8f3c7..c3b9a42299247 100644 --- a/.github/workflows/inductor-perf-test-nightly-macos.yml +++ b/.github/workflows/inductor-perf-test-nightly-macos.yml @@ -48,6 +48,9 @@ jobs: { config: "perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" }, { config: "perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" }, { config: "perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" }, + { config: "aot_inductor_perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" }, + { config: "aot_inductor_perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" }, + { config: "aot_inductor_perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" }, ]} secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-rocm.yml b/.github/workflows/inductor-perf-test-nightly-rocm.yml index 377f6d04bc8ce..f329fe74e6b64 100644 --- a/.github/workflows/inductor-perf-test-nightly-rocm.yml +++ b/.github/workflows/inductor-perf-test-nightly-rocm.yml @@ -85,26 +85,26 @@ jobs: uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-jammy-rocm-py3_10 - docker-image-name: 
ci-image:pytorch-linux-jammy-rocm-n-py3 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks test-matrix: | { include: [ - { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, + { config: 
"inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, ]} secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml index 6e19130a19246..170de752ab875 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml @@ -69,14 +69,14 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-zen-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "inductor_huggingface_perf_cpu_x86_zen", shard: 1, num_shards: 3, runner: "linux.24xlarge.amd" }, @@ -95,16 +95,16 @@ jobs: selected-test-configs: ${{ inputs.benchmark_configs }} secrets: inherit - linux-jammy-zen-cpu-py3_9-gcc11-inductor-test-nightly: - name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + inductor-test-nightly: + name: inductor-test-nightly uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build + needs: inductor-build if: github.event.schedule == '0 7 * * *' with: build-environment: linux-jammy-py3.9-gcc11-build dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true - docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false @@ -112,17 +112,16 @@ jobs: monitor-data-collect-interval: 4 secrets: inherit - - linux-jammy-zen-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + inductor-test: + name: inductor-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build + needs: inductor-build if: github.event_name == 'workflow_dispatch' with: build-environment: linux-jammy-py3.9-gcc11-build dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }} - docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false diff --git a/.github/workflows/inductor-perf-test-nightly-x86.yml b/.github/workflows/inductor-perf-test-nightly-x86.yml index 62234e5f499a7..f894b8fdc6e03 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86.yml @@ -74,14 +74,14 
@@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "inductor_huggingface_perf_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xl.spr-metal" }, @@ -101,16 +101,16 @@ jobs: build-additional-packages: "vision audio torchao" secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test-nightly-freezing: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-test-nightly-freezing: + name: inductor-test-nightly-freezing uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-build if: github.event.schedule == '0 7 * * *' with: build-environment: linux-jammy-py3.9-gcc11-build dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false @@ -118,16 +118,16 @@ jobs: monitor-data-collect-interval: 4 secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-test: + name: inductor-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-build if: github.event_name == 'workflow_dispatch' with: build-environment: linux-jammy-py3.9-gcc11-build dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }} - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml index 9fd81a5a05c9a..19f72ba453414 100644 --- a/.github/workflows/inductor-perf-test-nightly.yml +++ b/.github/workflows/inductor-perf-test-nightly.yml @@ -79,7 +79,6 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - # NB: Keep this in sync with trunk.yml build: name: cuda12.8-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-build.yml diff --git a/.github/workflows/inductor-periodic.yml b/.github/workflows/inductor-periodic.yml index db6a235b8c864..21d965eaeaada 100644 --- a/.github/workflows/inductor-periodic.yml +++ b/.github/workflows/inductor-periodic.yml @@ -31,8 +31,8 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - 
linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build: - name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks + periodic-dynamo-benchmarks-build: + name: periodic-dynamo-benchmarks-build uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: @@ -57,63 +57,73 @@ jobs: { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, ]} build-additional-packages: "vision audio fbgemm torchao" secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-test: - name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks + periodic-dynamo-benchmarks-test: + name: periodic-dynamo-benchmarks-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build + needs: periodic-dynamo-benchmarks-build with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + docker-image: ${{ needs.periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }} secrets: inherit - linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build: + rocm-periodic-dynamo-benchmarks-build: if: github.repository_owner == 'pytorch' - name: rocm-py3_10-periodic-dynamo-benchmarks + name: rocm-periodic-dynamo-benchmarks-build uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-jammy-rocm-py3_10 - docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks sync-tag: rocm-build test-matrix: | { include: [ - { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamo_eager_timm", shard: 2, num_shards: 2, 
runner: "linux.rocm.gpu.gfx942.2" }, - { config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, ]} secrets: inherit - linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-test: + rocm-periodic-dynamo-benchmarks-test: permissions: id-token: write contents: read - name: rocm-py3_10-periodic-dynamo-benchmarks + name: rocm-periodic-dynamo-benchmarks-test uses: ./.github/workflows/_rocm-test.yml - needs: linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build + needs: rocm-periodic-dynamo-benchmarks-build with: build-environment: linux-jammy-rocm-py3_10 - docker-image: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + docker-image: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build: - name: cuda12.8-py3.10-gcc9-sm80 + inductor-smoke-build: + name: inductor-smoke-build uses: 
./.github/workflows/_linux-build.yml needs: - get-default-label-prefix @@ -129,23 +139,23 @@ jobs: build-additional-packages: "vision audio fbgemm torchao" secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-test: - name: cuda12.8-py3.10-gcc9-sm80 + inductor-smoke-test: + name: inductor-smoke-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build + needs: inductor-smoke-build with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-smoke-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build: - name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks + periodic-dynamo-benchmarks-cpu-build: + name: periodic-dynamo-benchmarks-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | { include: [ @@ -160,68 +170,6 @@ jobs: { config: "cpu_inductor_freezing_avx2_torchbench", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" }, { config: "cpu_inductor_freezing_avx2_timm", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" }, { config: "cpu_inductor_freezing_avx2_timm", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" }, - ]} - build-additional-packages: "vision audio torchao" - secrets: inherit - - linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-test: - name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build - with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.test-matrix }} - secrets: inherit - - - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: - name: cuda12.8-py3.10-gcc9-sm86 - uses: ./.github/workflows/_linux-build.yml - needs: get-default-label-prefix - with: - build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks - cuda-arch-list: '8.6' - runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" - sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build - test-matrix: | - { include: [ - { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_huggingface", shard: 1, 
num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - ]} - build-additional-packages: "vision audio fbgemm torchao" - secrets: inherit - - linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: - name: cuda12.8-py3.10-gcc9-sm86 - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build - with: - build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} - secrets: inherit - - linux-jammy-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-cpu-py3.9-gcc11-inductor - uses: ./.github/workflows/_linux-build.yml - needs: get-default-label-prefix - with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks - runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" - sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build - test-matrix: | - { include: [ { config: "cpu_inductor_freezing_huggingface", shard: 1, num_shards: 1, runner: "linux.8xlarge.amx" }, { config: "cpu_inductor_freezing_timm", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "cpu_inductor_freezing_timm", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" }, @@ -247,12 +195,12 @@ jobs: build-additional-packages: "vision audio torchao" secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + periodic-dynamo-benchmarks-cpu-test: + name: periodic-dynamo-benchmarks-cpu-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: periodic-dynamo-benchmarks-cpu-build with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/inductor-rocm-mi300.yml b/.github/workflows/inductor-rocm-mi300.yml index f4c81ce7d7b8d..732ec7eb85f3e 100644 --- a/.github/workflows/inductor-rocm-mi300.yml +++ b/.github/workflows/inductor-rocm-mi300.yml @@ -47,8 +47,8 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 test-matrix: | { include: [ - { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, ]} secrets: inherit diff --git a/.github/workflows/inductor-unittest.yml b/.github/workflows/inductor-unittest.yml index df918c329dd77..2125a8559363b 100644 --- a/.github/workflows/inductor-unittest.yml +++ 
b/.github/workflows/inductor-unittest.yml @@ -28,8 +28,8 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: - name: cuda12.8-py3.10-gcc9-sm86 + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -47,44 +47,18 @@ jobs: ]} secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: - name: cuda12.8-py3.10-gcc9-sm86 + inductor-test: + name: inductor-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + needs: inductor-build with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cuda12_8-py3_12-gcc9-inductor-build: - name: cuda12.8-py3.12-gcc9-sm86 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks - cuda-arch-list: '8.6' - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - test-matrix: | - { include: [ - { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - ]} - secrets: inherit - - linux-jammy-cuda12_8-py3_12-gcc9-inductor-test: - name: cuda12.8-py3.12-gcc9-sm86 - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_12-gcc9-inductor-build - with: - build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.test-matrix }} - secrets: inherit - - linux-jammy-cpu-py3_12-inductor-halide-build: - name: linux-jammy-cpu-py3.12-gcc11-inductor-halide + inductor-halide-build: + name: inductor-halide-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -97,18 +71,18 @@ jobs: ]} secrets: inherit - linux-jammy-cpu-py3_12-inductor-halide-test: - name: linux-jammy-cpu-py3.12-gcc11-inductor-halide + inductor-halide-test: + name: inductor-halide-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_12-inductor-halide-build + needs: inductor-halide-build with: build-environment: linux-jammy-py3.12-gcc11 - docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-halide-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cpu-py3_12-inductor-triton-cpu-build: - name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu + inductor-triton-cpu-build: + name: inductor-triton-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -121,23 +95,23 @@ jobs: ]} secrets: inherit - linux-jammy-cpu-py3_12-inductor-triton-cpu-test: + inductor-triton-cpu-test: name: 
linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_12-inductor-triton-cpu-build + needs: inductor-triton-cpu-build with: build-environment: linux-jammy-py3.12-gcc11 - docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-triton-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-triton-cpu-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-cpu-build: + name: inductor-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ @@ -148,37 +122,12 @@ jobs: ]} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-cpu-test: + name: inductor-cpu-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-cpu-build with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} - secrets: inherit - - linux-jammy-cuda12_8-py3_13-gcc9-inductor-build: - name: cuda12.8-py3.13-gcc9-sm86 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks - cuda-arch-list: '8.6' - test-matrix: | - { include: [ - { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - ]} - secrets: inherit - - linux-jammy-cuda12_8-py3_13-gcc9-inductor-test: - name: cuda12.8-py3.13-gcc9-sm86 - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_13-gcc9-inductor-build - with: - build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 721572f1807ba..4189d24a7b14f 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -44,8 +44,8 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: - name: cuda12.8-py3.10-gcc9-sm86 + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -53,7 +53,6 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks 
cuda-arch-list: '8.6' runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build test-matrix: | { include: [ { config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, @@ -65,25 +64,24 @@ jobs: build-additional-packages: "vision audio fbgemm torchao" secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: - name: cuda12.8-py3.10-gcc9-sm86 + inductor-test: + name: inductor-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + needs: inductor-build with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-cpu-build: + name: inductor-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build test-matrix: | { include: [ { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, @@ -98,12 +96,12 @@ jobs: build-additional-packages: "vision audio torchao" secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-cpu-test: + name: inductor-cpu-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-cpu-build with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 476195ab5eec7..b1a6dfb390711 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -93,7 +93,7 @@ jobs: script: | CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}" echo "Running mypy" - ADDITIONAL_LINTRUNNER_ARGS="--take MYPY --all-files" .github/scripts/lintrunner.sh + ADDITIONAL_LINTRUNNER_ARGS="--take MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh lintrunner-noclang: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main @@ -111,9 +111,9 @@ jobs: CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}" echo "Running all other linters" if [ "$CHANGED_FILES" = '*' ]; then - ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY --all-files" .github/scripts/lintrunner.sh + ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh else - ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY 
${CHANGED_FILES}" .github/scripts/lintrunner.sh + ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT ${CHANGED_FILES}" .github/scripts/lintrunner.sh fi quick-checks: diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 2acc987e523c4..65b8781be7585 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -42,8 +42,8 @@ jobs: needs: get-label-type with: runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" - build-environment: linux-jammy-py3.9-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 secrets: inherit docs-push: diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml index 16cb1600b8d6b..aaf32c160f0dc 100644 --- a/.github/workflows/operator_benchmark.yml +++ b/.github/workflows/operator_benchmark.yml @@ -24,38 +24,38 @@ permissions: contents: read jobs: - linux-jammy-cpu-py3_9-gcc11-opbenchmark-build: + opbenchmark-build: if: github.repository_owner == 'pytorch' - name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + name: opbenchmark-build uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, ]} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-opbenchmark-on-demand-build: + opbenchmark-on-demand-build: if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }} - name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + name: opbenchmark-on-demand-build uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, ]} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-opbenchmark-test: - name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + opbenchmark-test: + name: opbenchmark-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-opbenchmark-build + needs: opbenchmark-build with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.test-matrix }} + docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 976fb241c99f9..714838eb84762 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -51,37 +51,6 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - linux-jammy-cuda12_4-py3_10-gcc11-sm89-build: - name: linux-jammy-cuda12.4-py3.10-gcc11-sm89 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: 
linux-jammy-cuda12.4-py3.10-gcc11-sm89
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11
-      cuda-arch-list: 8.9
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-        ]}
-    secrets: inherit
-
-  linux-jammy-cuda12_4-py3_10-gcc11-sm89-test:
-    name: linux-jammy-cuda12.4-py3.10-gcc11-sm89
-    uses: ./.github/workflows/_linux-test.yml
-    needs:
-      - linux-jammy-cuda12_4-py3_10-gcc11-sm89-build
-      - target-determination
-    with:
-      build-environment: linux-jammy-cuda12.4-py3.10-gcc11-sm89
-      docker-image: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.test-matrix }}
-    secrets: inherit
-
  linux-jammy-cuda12_4-py3_10-gcc11-build:
    name: linux-jammy-cuda12.4-py3.10-gcc11
    uses: ./.github/workflows/_linux-build.yml
@@ -201,6 +170,38 @@ jobs:
      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }}
    secrets: inherit
+  linux-jammy-cuda13_0-py3_10-gcc11-build:
+    name: linux-jammy-cuda13.0-py3.10-gcc11
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      cuda-arch-list: 7.5
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
+      test-matrix: |
+        { include: [
+          { config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda13_0-py3_10-gcc11-test:
+    name: linux-jammy-cuda13.0-py3.10-gcc11
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda13_0-py3_10-gcc11-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11
+      docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
+    secrets: inherit
+
  linux-jammy-rocm-py3_10-build:
    name: linux-jammy-rocm-py3.10
    uses: ./.github/workflows/_linux-build.yml
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 519a1a870b16f..3f13fbf276882 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -49,14 +49,14 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} - linux-jammy-py3_9-gcc11-build: - name: linux-jammy-py3.9-gcc11 + linux-jammy-py3_10-gcc11-build: + name: linux-jammy-py3.10-gcc11 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ -73,49 +73,49 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-gcc11-test: - name: linux-jammy-py3.9-gcc11 + linux-jammy-py3_10-gcc11-test: + name: linux-jammy-py3.10-gcc11 uses: ./.github/workflows/_linux-test.yml needs: - - linux-jammy-py3_9-gcc11-build + - linux-jammy-py3_10-gcc11-build - target-determination with: - build-environment: linux-jammy-py3.9-gcc11 - docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.test-matrix }} secrets: inherit linux-docs: name: linux-docs uses: ./.github/workflows/_docs.yml - needs: linux-jammy-py3_9-gcc11-build + needs: linux-jammy-py3_10-gcc11-build with: - build-environment: linux-jammy-py3.9-gcc11 - docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }} + build-environment: linux-jammy-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }} secrets: inherit - linux-jammy-py3_9-gcc11-no-ops: - name: linux-jammy-py3.9-gcc11-no-ops + linux-jammy-py3_10-gcc11-no-ops: + name: linux-jammy-py3.10-gcc11-no-ops uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-no-ops - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11-no-ops + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, ]} secrets: inherit - linux-jammy-py3_9-gcc11-pch: - name: linux-jammy-py3.9-gcc11-pch + linux-jammy-py3_10-gcc11-pch: + name: linux-jammy-py3.10-gcc11-pch uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-pch - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11-pch + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, @@ -132,17 +132,17 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type 
}}linux.4xlarge" }, - { config: "default", shard: 2, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 3, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 4, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 5, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 6, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 2, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 3, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 4, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, ]} sync-tag: asan-build secrets: inherit - linux-jammy-py3_10-clang18-asan-test: name: linux-jammy-py3.10-clang18-asan uses: ./.github/workflows/_linux-test.yml @@ -156,13 +156,13 @@ jobs: sync-tag: asan-test secrets: inherit - linux-jammy-py3_9-clang12-onnx-build: - name: linux-jammy-py3.9-clang12-onnx + linux-jammy-py3_10-clang12-onnx-build: + name: linux-jammy-py3.10-clang12-onnx uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-clang12-onnx + build-environment: linux-jammy-py3.10-clang12-onnx docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-onnx test-matrix: | { include: [ @@ -171,26 +171,26 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-clang12-onnx-test: - name: linux-jammy-py3.9-clang12-onnx + linux-jammy-py3_10-clang12-onnx-test: + name: linux-jammy-py3.10-clang12-onnx uses: ./.github/workflows/_linux-test.yml needs: - - linux-jammy-py3_9-clang12-onnx-build + - linux-jammy-py3_10-clang12-onnx-build - target-determination with: - build-environment: linux-jammy-py3.9-clang12-onnx - docker-image: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-clang12-onnx + docker-image: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.test-matrix }} secrets: inherit - linux-jammy-py3_9-clang12-build: - name: linux-jammy-py3.9-clang12 + linux-jammy-py3_10-clang12-build: + name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-clang12 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12 + build-environment: linux-jammy-py3.10-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12 test-matrix: | { include: [ { 
config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, @@ -207,16 +207,16 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-clang12-test: - name: linux-jammy-py3.9-clang12 + linux-jammy-py3_10-clang12-test: + name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-test.yml needs: - - linux-jammy-py3_9-clang12-build + - linux-jammy-py3_10-clang12-build - target-determination with: - build-environment: linux-jammy-py3.9-clang12 - docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-clang12 + docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }} secrets: inherit linux-jammy-py3_13-clang12-build: @@ -251,108 +251,22 @@ jobs: build-environment: linux-jammy-py3.13-clang12 docker-image: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.test-matrix }} - timeout-minutes: 600 - secrets: inherit - - linux-jammy-cuda12_8-py3_10-gcc11-build-distributed: - name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 - cuda-arch-list: '7.5' - test-matrix: | - { include: [ - { config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, - { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, - { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, - ]} - secrets: inherit - - linux-jammy-cuda12_8-py3_10-gcc11-test-distributed: - name: linux-jammy-cuda12.8-py3.10-gcc11-test - uses: ./.github/workflows/_linux-test.yml - needs: - - linux-jammy-cuda12_8-py3_10-gcc11-build-distributed - - target-determination - with: - timeout-minutes: 360 - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.test-matrix }} secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc11-build: - name: linux-jammy-cuda12.8-py3.10-gcc11 + linux-jammy-cuda12_8-cudnn9-py3_10-clang12-build: + name: linux-jammy-cuda12.8-cudnn9-py3.10-clang12 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-cuda12.8-py3.10-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { 
config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - ]} - secrets: inherit - - linux-jammy-cuda12_8-py3_10-gcc11-test: - name: linux-jammy-cuda12.8-py3.10-gcc11 - uses: ./.github/workflows/_linux-test.yml - needs: - - linux-jammy-cuda12_8-py3_10-gcc11-build - - target-determination - with: - timeout-minutes: 360 - build-environment: linux-jammy-cuda12.8-py3.10-gcc11 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }} - secrets: inherit - - linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build: - name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-cuda12.8-cudnn9-py3.9-clang12 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12 + build-environment: linux-jammy-cuda12.8-cudnn9-py3.10-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, ]} secrets: inherit - linux-jammy-py3_9-clang9-xla-build: - name: linux-jammy-py3_9-clang9-xla - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-clang9-xla - docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite - test-matrix: | - { include: [ - { config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" }, - ]} - secrets: inherit - - linux-jammy-py3_9-clang9-xla-test: - name: linux-jammy-py3_9-clang9-xla - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-py3_9-clang9-xla-build - with: - build-environment: linux-jammy-py3.9-clang9-xla - docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }} - secrets: inherit - linux-jammy-cpu-py3_10-gcc11-bazel-test: name: linux-jammy-cpu-py3.10-gcc11-bazel-test uses: ./.github/workflows/_bazel-build-test.yml @@ -368,14 +282,14 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-gcc11-mobile-lightweight-dispatch-build: - name: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build + linux-jammy-py3_10-gcc11-mobile-lightweight-dispatch-build: + name: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 build-generates-artifacts: false test-matrix: | { include: [ @@ -402,37 +316,6 @@ jobs: ]} secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc11-sm89-build: - name: linux-jammy-cuda12.8-py3.10-gcc11-sm89 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" 
- build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 - cuda-arch-list: 8.9 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, - { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, - { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, - ]} - secrets: inherit - - linux-jammy-cuda12_8-py3_10-gcc11-sm89-test: - name: linux-jammy-cuda12.8-py3.10-gcc11-sm89 - uses: ./.github/workflows/_linux-test.yml - needs: - - linux-jammy-cuda12_8-py3_10-gcc11-sm89-build - - target-determination - with: - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.test-matrix }} - secrets: inherit - linux-jammy-py3-clang12-executorch-build: if: false # Docker build needs pin update name: linux-jammy-py3-clang12-executorch @@ -484,15 +367,15 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} secrets: inherit - linux-jammy-xpu-2025_1-py3_9-build: - name: linux-jammy-xpu-2025.1-py3.9 + linux-jammy-xpu-n-py3_9-build: + name: linux-jammy-xpu-n-py3.9 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - sync-tag: linux-xpu-2025-1-build + sync-tag: linux-xpu-n-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-2025.1-py3.9 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3 + build-environment: linux-jammy-xpu-n-py3.9 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" }, diff --git a/.github/workflows/riscv64.yml b/.github/workflows/riscv64.yml new file mode 100644 index 0000000000000..e4ec656fafcc3 --- /dev/null +++ b/.github/workflows/riscv64.yml @@ -0,0 +1,24 @@ +name: riscv64 + +on: + push: + tags: + - ciflow/riscv64/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +permissions: read-all + +jobs: + pytorch-linux-noble-riscv64-py3_12-gcc14-cross-build: + if: github.repository_owner == 'pytorch' + name: pytorch-linux-noble-riscv64-py3_12-gcc14-cross-build + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-noble-riscv64-py3.12-gcc14 + docker-image-name: pytorch-linux-noble-riscv64-py3.12-gcc14 + runner: linux.2xlarge + secrets: inherit diff --git a/.github/workflows/rocm-mi300.yml b/.github/workflows/rocm-mi300.yml index c51d89e5c955d..7e3ba43bf9845 100644 --- a/.github/workflows/rocm-mi300.yml +++ b/.github/workflows/rocm-mi300.yml @@ -48,12 +48,12 @@ jobs: sync-tag: rocm-build test-matrix: | { include: [ - { config: "default", shard: 1, 
num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" }, ]} secrets: inherit diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index 2a7b1d184330b..19b402f854572 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -78,14 +78,14 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.test-matrix }} secrets: inherit - linux-jammy-py3_9-clang12-build: - name: linux-jammy-py3.9-clang12 + linux-jammy-py3_10-clang12-build: + name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-clang12 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12 + build-environment: linux-jammy-py3.10-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12 test-matrix: | { include: [ { config: "slow", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, @@ -93,16 +93,16 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-clang12-test: - name: linux-jammy-py3.9-clang12 + linux-jammy-py3_10-clang12-test: + name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-test.yml needs: - - linux-jammy-py3_9-clang12-build + - linux-jammy-py3_10-clang12-build - target-determination with: - build-environment: linux-jammy-py3.9-clang12 - docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-clang12 + docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }} secrets: inherit linux-jammy-rocm-py3_10-build: diff --git a/.github/workflows/test-check-binary.yml b/.github/workflows/test-check-binary.yml index 0d31948f196a1..5f0ad59d3a3bb 100644 --- a/.github/workflows/test-check-binary.yml +++ b/.github/workflows/test-check-binary.yml @@ -30,7 +30,7 @@ jobs: name: Test check_binary.sh for Linux CUDA uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: - runner: linux.4xlarge.nvidia.gpu + runner: linux.g4dn.4xlarge.nvidia.gpu docker-image: python:3.11 docker-build-dir: "skip-docker-build" script: | diff --git a/.github/workflows/test-h100.yml b/.github/workflows/test-h100.yml index 7e4a818c3528d..1e83c7b9d98ce 100644 --- a/.github/workflows/test-h100.yml +++ b/.github/workflows/test-h100.yml @@ -4,6 +4,10 @@ on: pull_request: paths: - .github/workflows/test-h100.yml + - test/inductor/test_max_autotune.py + - 
torch/_inductor/kernel/mm.py + - torch/_inductor/kernel/mm_grouped.py + workflow_dispatch: schedule: - cron: 0 4,10,16,22 * * * # every 6 hours diff --git a/.github/workflows/tools-unit-tests.yml b/.github/workflows/tools-unit-tests.yml new file mode 100644 index 0000000000000..c687c07b7ca7e --- /dev/null +++ b/.github/workflows/tools-unit-tests.yml @@ -0,0 +1,70 @@ +name: test-scripts-and-ci-tools + +on: + push: + branches: + - main + paths: + - scripts/lumen_cli/** + - .github/workflows/tools-unit-tests.yml + pull_request: + paths: + - scripts/lumen_cli/** + - .github/workflows/tools-unit-tests.yml + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + lumen-cli-unit-tests-python312: + permissions: + contents: read + pull-requests: write + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: ubuntu-latest + steps: + - name: Checkout pytorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: true + fetch-depth: 0 + - name: Setup Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: '3.12' + cache: pip + + - name: Run tests + continue-on-error: true + run: | + set -ex + python3 -m venv /tmp/venv + source /tmp/venv/bin/activate + pip install -e .ci/lumen_cli/ + pytest -v -s .ci/lumen_cli/tests/* + + lumen-cli-compatible-python39: + permissions: + contents: read + pull-requests: write + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: ubuntu-latest + steps: + - name: Checkout pytorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: true + fetch-depth: 0 + - name: Setup Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: '3.9' + cache: 'pip' + - name: Run tests + continue-on-error: true + run: | + set -ex + python3 -m venv /tmp/venv + source /tmp/venv/bin/activate + pip install -e .ci/lumen_cli/ diff --git a/.github/workflows/torchbench.yml b/.github/workflows/torchbench.yml index c656c16e97c2e..08fcd33402625 100644 --- a/.github/workflows/torchbench.yml +++ b/.github/workflows/torchbench.yml @@ -10,6 +10,10 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +permissions: + id-token: write + contents: read + jobs: get-default-label-prefix: if: github.repository_owner == 'pytorch' diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 3879b62cc020e..4dd465d70803d 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -63,6 +63,43 @@ jobs: ]} secrets: inherit + linux-jammy-cuda12_8-py3_10-gcc11-build: + name: linux-jammy-cuda12.8-py3.10-gcc11 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '7.5 8.9' + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 5, runner: "${{ 
needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, + { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, + { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, + { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-test: + name: linux-jammy-cuda12.8-py3.10-gcc11 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-build + - target-determination + with: + timeout-minutes: 360 + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }} + secrets: inherit + + # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated linux-jammy-cuda12_8-py3_10-gcc11-no-ops-build: name: linux-jammy-cuda12.8-py3.10-gcc11-no-ops @@ -164,9 +201,9 @@ jobs: sync-tag: rocm-build test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2" }, - { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.4" }, + { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.4" }, ]} secrets: inherit @@ -187,13 +224,12 @@ jobs: tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl" secrets: inherit - # NB: Keep this in sync with inductor-perf-test-nightly.yml - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: - name: cuda12.8-py3.10-gcc9-sm80 + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks cuda-arch-list: '8.0' secrets: inherit @@ -205,7 +241,7 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, diff --git 
a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml index 1fdb1da67a595..5c456c607c887 100644 --- a/.github/workflows/trymerge.yml +++ b/.github/workflows/trymerge.yml @@ -59,22 +59,19 @@ jobs: # on the PR appear in chronological order (timing issues can shuffle them around) sleep 60 fi + + # Require a comment id for merge operations + if [ -z "${COMMENT_ID}" ]; then + echo "Error: merge requires COMMENT_ID to be specified" + exit 1 + fi + if [ -n "${FORCE}" ]; then - if [ -n "${COMMENT_ID}" ]; then - python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}" - else - python3 .github/scripts/trymerge.py --force "${PR_NUM}" - fi + python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}" elif [ -n "${IGNORE_CURRENT}" ]; then - if [ -n "${COMMENT_ID}" ]; then - python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}" - else - python3 .github/scripts/trymerge.py --ignore-current "${PR_NUM}" - fi - elif [ -n "${COMMENT_ID}" ]; then - python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}" + python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}" else - python3 .github/scripts/trymerge.py "${PR_NUM}" + python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}" fi - name: Comment on Canceled if: ${{ cancelled() && steps.checkout.outcome == 'success' }} diff --git a/.github/workflows/unstable.yml b/.github/workflows/unstable.yml index 08ae920e7cb0d..7f0fe6058bd08 100644 --- a/.github/workflows/unstable.yml +++ b/.github/workflows/unstable.yml @@ -12,7 +12,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true -permissions: read-all +permissions: + id-token: write + contents: read jobs: # There must be at least one job here to satisfy GitHub action workflow syntax @@ -51,3 +53,27 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} + + linux-jammy-py3_9-clang9-xla-build: + name: linux-jammy-py3_9-clang9-xla + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py3.9-clang9-xla + docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite + test-matrix: | + { include: [ + { config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" }, + ]} + secrets: inherit + + linux-jammy-py3_9-clang9-xla-test: + name: linux-jammy-py3_9-clang9-xla + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-py3_9-clang9-xla-build + with: + build-environment: linux-jammy-py3.9-clang9-xla + docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index 3d445756f7a2e..aa12cf22b246c 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -23,7 +23,7 @@ jobs: with: repository: pytorch/pytorch stable-branch: viable/strict - requires: '[\"pull\", \"trunk\", 
\"lint\", \"linux-binary\", \"linux-aarch64\"]' + requires: '[\"pull\", \"trunk\", \"lint\", \"^linux-binary-manywheel$\", \"^linux-binary-libtorch-release$\", \"linux-aarch64\"]' secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }} clickhouse-url: ${{ secrets.CLICKHOUSE_URL }} clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }} diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml new file mode 100644 index 0000000000000..654e88be386b6 --- /dev/null +++ b/.github/workflows/vllm.yml @@ -0,0 +1,76 @@ +name: vllm-test + +on: + push: + branches: + - main + - release/* + tags: + - ciflow/vllm/* + workflow_dispatch: + schedule: + - cron: '0 */8 * * *' # every 8 hours at minute 0 (UTC) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + get-label-type: + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + opt_out_experiments: lf + + torch-build: + name: ci-vllm-test + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-additional-packages: "vision audio" + build-external-packages: "vllm" + build-environment: linux-jammy-cuda12.8-py3.12-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm + cuda-arch-list: '8.0;8.9;9.0' + runner: linux.24xlarge.memory + test-matrix: | + { include: [ + { config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_entrypoints_test", shard: 1, num_shards: 1,runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_regression_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_multi_model_processor_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_pytorch_compilation_unit_tests", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_multi_model_test_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"}, + { config: "vllm_languagde_model_test_extended_generation_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"}, + { config: "vllm_distributed_test_2_gpu_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_test", shard: 0, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_test", shard: 1, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_test", shard: 2, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { 
config: "vllm_lora_test", shard: 3, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_tp_test_distributed", shard: 1, num_shards: 1, runner: "linux.g6.12xlarge.nvidia.gpu"}, + { config: "vllm_distributed_test_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.12xlarge.nvidia.gpu"} + ]} + secrets: inherit + + vllm-test-sm89: + name: ci-vllm-test + uses: ./.github/workflows/_linux-test.yml + needs: [ + torch-build, + ] + with: + build-environment: linux-jammy-cuda12.8-py3.12-gcc11 + docker-image: ${{ needs.torch-build.outputs.docker-image }} + test-matrix: ${{ needs.torch-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/win-arm64-build-test.yml b/.github/workflows/win-arm64-build-test.yml index 627a43b56bf70..95b4e2f027f60 100644 --- a/.github/workflows/win-arm64-build-test.yml +++ b/.github/workflows/win-arm64-build-test.yml @@ -4,6 +4,9 @@ on: push: tags: - ciflow/win-arm64/* + schedule: + # Every 4 hours starting at 00:00 UTC + - cron: '0 */4 * * *' env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} diff --git a/.github/workflows/xpu.yml b/.github/workflows/xpu.yml index c62918b4af210..36ba62349f28b 100644 --- a/.github/workflows/xpu.yml +++ b/.github/workflows/xpu.yml @@ -26,15 +26,15 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - linux-jammy-xpu-2025_0-py3_9-build: - name: linux-jammy-xpu-2025.0-py3.9 + linux-jammy-xpu-n-1-py3_10-build: + name: linux-jammy-xpu-n-1-py3.10 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - sync-tag: linux-xpu-2025-0-build + sync-tag: linux-xpu-n-1-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-2025.0-py3.9 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.0-py3 + build-environment: linux-jammy-xpu-n-1-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3 runner: linux.12xlarge test-matrix: | { include: [ @@ -47,60 +47,62 @@ jobs: ]} secrets: inherit - linux-jammy-xpu-2025_1-py3_9-build: - name: linux-jammy-xpu-2025.1-py3.9 + linux-jammy-xpu-n-py3_10-build: + name: linux-jammy-xpu-n-py3.10 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - sync-tag: linux-xpu-2025-1-build + sync-tag: linux-xpu-n-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-2025.1-py3.9 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3 + build-environment: linux-jammy-xpu-n-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 runner: linux.12xlarge test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 2, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 3, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 4, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 5, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 6, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 6, 
num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" }, ]} secrets: inherit - linux-jammy-xpu-2025_1-py3_9-test: - name: linux-jammy-xpu-2025.1-py3.9 + linux-jammy-xpu-n-py3_10-test: + name: linux-jammy-xpu-n-py3.10 uses: ./.github/workflows/_xpu-test.yml - needs: linux-jammy-xpu-2025_1-py3_9-build + needs: linux-jammy-xpu-n-py3_10-build permissions: id-token: write contents: read with: - build-environment: linux-jammy-xpu-2025.1-py3.9 - docker-image: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.test-matrix }} + build-environment: linux-jammy-xpu-n-py3.10 + docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }} secrets: inherit - windows-xpu-2025_0-build: + windows-xpu-n-1-build: if: github.repository_owner == 'pytorch' - name: win-vs2022-xpu-2025_0-py3 + name: win-vs2022-xpu-n-1-py3 uses: ./.github/workflows/_win-build.yml with: - build-environment: win-vs2022-xpu-py3 + build-environment: win-vs2022-xpu-n-1-py3 cuda-version: cpu use-xpu: true - xpu-version: '2025.0' + xpu-version: '2025.1' vc-year: '2022' secrets: inherit - windows-xpu-2025_1-build: + windows-xpu-n-build: if: github.repository_owner == 'pytorch' - name: win-vs2022-xpu-2025_1-py3 + name: win-vs2022-xpu-n-py3 uses: ./.github/workflows/_win-build.yml with: - build-environment: win-vs2022-xpu-py3 + build-environment: win-vs2022-xpu-n-py3 cuda-version: cpu use-xpu: true - xpu-version: '2025.1' + xpu-version: '2025.2' vc-year: '2022' secrets: inherit diff --git a/.gitignore b/.gitignore index b4e78e642b245..d1fa4cd3caf28 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ coverage.xml aten/build/ aten/src/ATen/Config.h aten/src/ATen/cuda/CUDAConfig.h +aten/src/ATen/hip/HIPConfig.h benchmarks/.data caffe2/cpp_test/ dist/ @@ -146,6 +147,9 @@ merge_record.json torchgen/packaged/* !torchgen/packaged/README.md +# This file is injected by ROCm build scripts to bootstrap in torch/__init__.py. 
+torch/_rocm_init.py + # IPython notebook checkpoints .ipynb_checkpoints diff --git a/.lintrunner.toml b/.lintrunner.toml index 9c46c91b5e353..944829fa38977 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -132,7 +132,7 @@ include_patterns = [ 'test/test_complex.py', 'test/test_datapipe.py', 'test/test_futures.py', - # 'test/test_numpy_interop.py', + 'test/test_numpy_interop.py', 'test/test_torch.py', 'test/test_type_hints.py', 'test/test_type_info.py', @@ -583,7 +583,7 @@ exclude_patterns = [ command = [ 'python3', 'tools/linter/adapters/grep_linter.py', - '--pattern=#include ', '--linter-name=PYBIND11_INCLUDE', '--match-first-only', @@ -1452,11 +1452,9 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - '--no-black-binary', - 'black==23.12.1', 'usort==1.0.8.post1', 'isort==6.0.1', - 'ruff==0.12.2', # sync with RUFF + 'ruff==0.12.9', # sync with RUFF ] is_formatter = true @@ -1591,7 +1589,7 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - 'ruff==0.12.2', # sync with PYFMT + 'ruff==0.12.9', # sync with PYFMT ] is_formatter = true diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index 2c67fb1981b71..0000000000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,12 +0,0 @@ -repos: - - repo: local - hooks: - - id: lintrunner - name: Run Lintrunner in an isolated venv before every push. The first run may be slow... - entry: python scripts/run_lintrunner.py # wrapper below - language: python # pre‑commit manages venv for the wrapper - additional_dependencies: [] # wrapper handles lintrunner install - always_run: true - stages: [pre-push] # fire only on pre‑push - pass_filenames: false # Lintrunner gets no per‑file args - verbose: true # stream output as it is produced...allegedly anyways diff --git a/AGENTS.md b/AGENTS.md index daf0f491702ba..3d5436a02a85d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1 +1,17 @@ - This is the only AGENTS.md, there are no recursive AGENTS.md +- When you are working on a bug, first create a standalone file that + reproduces the bug and verify it fails in the expected way. Use this to + test if your changes work. Once the change is passing, find an appropriate + test file to add the test to and make sure to follow local conventions on + the test file. +- If you are running the real test suite, DO NOT run the entire test suite. 
+ Instead run only a single test case, e.g., 'python test/test_torch.py TestTorch.test_dir' +- Do NOT run setup.py, you do not have a working build environment +- Do NOT run pre-commit, it is not setup +- To run lint, run 'lintrunner -a' (which will autoapply changes) +- Do NOT attempt to install dependencies, you do not have Internet access +- When you are ready to make a PR, do exactly these steps: + - git stash -u + - git reset --hard $(cat /tmp/orig_work.txt) # NB: reset to the LOCAL branch, do NOT fetch + - git stash pop + - Resolve conflicts if necessary diff --git a/BUILD.bazel b/BUILD.bazel index 50ffa12576475..2cbd36f06761b 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -22,7 +22,6 @@ COMMON_COPTS = [ "-DHAVE_SHM_UNLINK=1", "-D_FILE_OFFSET_BITS=64", "-DUSE_FBGEMM", - "-DUSE_DISTRIBUTED", "-DAT_PER_OPERATOR_HEADERS", "-DATEN_THREADING=NATIVE", "-DNO_CUDNN_DESTROY_HANDLE", @@ -279,6 +278,7 @@ header_template_rule( "@AT_BLAS_F2C@": "0", "@AT_BLAS_USE_CBLAS_DOT@": "1", "@AT_KLEIDIAI_ENABLED@": "0", + "@AT_USE_EIGEN_SPARSE@": "0", }, ) @@ -746,6 +746,7 @@ cc_library( "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu", "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu", "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp", + "torch/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp", "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu", ], )) + torch_sources, diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000000000..dcdf409e73146 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,15 @@ +# Testing + +Use our test class and test runner: + +``` +from torch.testing._internal.common_utils import run_tests, TestCase + +class TestFeature(TestCase): + ... + +if __name__ == "__main__": + run_tests() +``` + +To test Tensor equality, use assertEqual. diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d3314c72814a..21c867dd6b6e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,8 +181,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)") set(CPU_POWER ON) endif() -# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not -# tested and likely won't work without additional changes. +# For non-supported platforms, turn USE_DISTRIBUTED off by default. 
+# NB: USE_DISTRIBUTED simply disables the backend; distributed code +# still gets built if(NOT LINUX AND NOT WIN32) set(USE_DISTRIBUTED OFF @@ -233,13 +234,16 @@ cmake_dependent_option(INSTALL_TEST "Install test binaries if BUILD_TEST is on" option(USE_CPP_CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF) option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON) option(USE_ASAN "Use Address+Undefined Sanitizers" OFF) +option(USE_LSAN "Use Leak Sanitizer" OFF) option(USE_TSAN "Use Thread Sanitizer" OFF) option(USE_CUDA "Use CUDA" ON) option(USE_XPU "Use XPU" ON) cmake_dependent_option( BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) -cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) +cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX OR WIN32" OFF) +cmake_dependent_option(USE_ROCM_CK_GEMM "Use ROCm Composable Kernel for GEMMs" ON "USE_ROCM;NOT WIN32" OFF) +option(USE_ROCM_CK_SDPA "Use ROCm Composable Kernel for SDPA" OFF) option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF @@ -251,7 +255,6 @@ cmake_dependent_option(USE_CUFILE "Use cuFile" ON "USE_CUDA AND NOT WIN32" OFF) option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) option(USE_KINETO "Use Kineto profiling library" ON) option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) -option(USE_FAKELOWP "Use FakeLowp operators" OFF) option(USE_GFLAGS "Use GFLAGS" OFF) option(USE_GLOG "Use GLOG" OFF) option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF) @@ -260,16 +263,18 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF) option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF) option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF) +option(USE_DISTRIBUTED "Enable default distributed backends" ON) cmake_dependent_option(USE_NCCL "Use NCCL" ON - "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) + "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) cmake_dependent_option(USE_XCCL "Use XCCL" ON - "USE_XPU;UNIX;NOT APPLE" OFF) + "USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF) cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF) +cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF) cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL" OFF) cmake_dependent_option(USE_NVSHMEM "Use NVSHMEM" ON - "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) + "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) option(USE_NNAPI "Use NNAPI" OFF) option(USE_NNPACK "Use NNPACK" ON) cmake_dependent_option(USE_NUMA "Use NUMA. Only available on Linux." ON "LINUX" @@ -286,6 +291,7 @@ option(USE_PRECOMPILED_HEADERS "Use pre-compiled headers to accelerate build." 
option(USE_PROF "Use profiling" OFF) option(USE_PYTORCH_QNNPACK "Use ATen/QNNPACK (quantized 8-bit operators)" ON) option(USE_SNPE "Use Qualcomm's SNPE library" OFF) +option(USE_EIGEN_SPARSE "Use Eigen Sparse Matrices" OFF) option(USE_SYSTEM_EIGEN_INSTALL "Use system Eigen instead of the one under third_party" OFF) cmake_dependent_option( @@ -322,7 +328,6 @@ set(MKLDNN_ENABLE_CONCURRENT_EXEC ${USE_MKLDNN}) cmake_dependent_option(USE_MKLDNN_CBLAS "Use CBLAS in MKLDNN" OFF "USE_MKLDNN" OFF) option(USE_STATIC_MKL "Prefer to link with MKL statically (Unix only)" OFF) -option(USE_DISTRIBUTED "Use distributed" ON) cmake_dependent_option( USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON "USE_DISTRIBUTED" OFF) @@ -427,11 +432,10 @@ if(WIN32) PATH_SUFFIXES lib NO_DEFAULT_PATH) if(NOT libuv_tmp_LIBRARY) - set(USE_DISTRIBUTED OFF) set(USE_GLOO OFF) message( WARNING - "Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. " + "Libuv is not installed in current conda env. Set USE_GLOO to OFF. " "Please run command 'conda install -c conda-forge libuv=1.39' to install libuv." ) else() @@ -834,10 +838,11 @@ include(ExternalProject) # ---[ Dependencies ---[ FBGEMM doesn't work on x86 32bit and # CMAKE_SYSTEM_PROCESSOR thinks its 64bit -if(USE_FBGEMM - AND((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND CMAKE_SIZEOF_VOID_P EQUAL - 4) - OR CMAKE_SYSTEM_PROCESSOR STREQUAL "x86")) +if(USE_FBGEMM AND NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + message(WARNING + "x64 operating system is required for FBGEMM. " + "Not compiling with FBGEMM. " + "Turn this warning off by USE_FBGEMM=OFF.") set(USE_FBGEMM OFF) endif() @@ -1193,7 +1198,7 @@ if(APPLE) string( APPEND CMAKE_SHARED_LINKER_FLAGS - " -weak_framework Foundation -weak_framework MetalPerformanceShaders -weak_framework MetalPerformanceShadersGraph -weak_framework Metal" + " -weak_framework Foundation -weak_framework MetalPerformanceShaders -weak_framework MetalPerformanceShadersGraph -weak_framework Metal -weak_framework IOKit" ) # To suppress MPSGraph availability warnings append_cxx_flag_if_supported("-Wno-unguarded-availability-new" diff --git a/CODEOWNERS b/CODEOWNERS index 24ab4fd35be9d..1d91adacb0629 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -164,6 +164,7 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd # torch.export /torch/export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi /torch/_export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi +/torch/_export/serde/schema.py @SherlockNoMad @zhxchen17 # Dynamic Shapes /torch/fx/experimental/symbolic_shapes.py @bobrenjc93 @laithsakka diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index dade8f4ec6ec0..9d2b5d3553910 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -88,13 +88,13 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows * If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below. -* When installing with `python -m pip install -e .` (in contrast to `python -m pip install .`) Python runtime will use +* When installing with `python -m pip install -e . -v --no-build-isolation` (in contrast to `python -m pip install . -v --no-build-isolation`) Python runtime will use the current local source-tree when importing `torch` package. (This is done by creating [`.egg-link`](https://wiki.python.org/moin/PythonPackagingTerminology#egg-link) file in `site-packages` folder) This way you do not need to repeatedly install after modifying Python files (`.py`). 
However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...). - One way to avoid running `python -m pip install -e .` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac, + One way to avoid running `python -m pip install -e . -v --no-build-isolation` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac, is to create a symbolic link from `build` folder to `torch/lib`, for example, by issuing following: ```bash pushd torch/lib; sh -c "ln -sf ../../build/lib/libtorch_cpu.* ."; popd @@ -116,7 +116,7 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows Next run `python setup.py clean`. After that, you can install in editable mode again. -* If you run into errors when running `python -m pip install -e .`, here are some debugging steps: +* If you run into errors when running `python -m pip install -e . -v --no-build-isolation`, here are some debugging steps: 1. Run `printf '#include \nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure your CMake works and can compile this simple Hello World program without errors. 2. Nuke your `build` directory. The `setup.py` script compiles binaries into the `build` folder and caches many @@ -129,10 +129,10 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows git clean -xdf python setup.py clean git submodule update --init --recursive - python -m pip install -r requirements.txt + python -m pip install --group dev python -m pip install --no-build-isolation -v -e . ``` - 4. The main step within `python -m pip install -e .` is running `cmake --build build` from the `build` directory. If you want to + 4. The main step within `python -m pip install -e . -v --no-build-isolation` is running `make` from the `build` directory. If you want to experiment with some environment variables, you can pass them into the command: ```bash ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* CMAKE_FRESH=1 python -m pip install --no-build-isolation -v -e . @@ -259,6 +259,7 @@ dependencies as well as the nightly binaries into the repo directory. support for PyTorch. * [tools](tools) - Code generation scripts for the PyTorch library. See [README](tools/README.md) of this directory for more details. +* [torchgen](torchgen) - contains the logic and tooling for generating PyTorch's low-level C++ and Python bindings from operator definitions, typically specified in native_functions.yaml * [test](test) - Python unit tests for PyTorch Python frontend. * [test_torch.py](test/test_torch.py) - Basic tests for PyTorch functionality. @@ -294,7 +295,7 @@ The following packages should be installed with `pip`: - `pytest` - recommended to run tests more selectively Running ``` -pip install -r requirements.txt +pip install --group dev ``` will install these dependencies for you. @@ -645,9 +646,9 @@ can be selected interactively with your mouse to zoom in on a particular part of the program execution timeline. The `--native` command-line option tells `py-spy` to record stack frame entries for PyTorch C++ code. To get line numbers for C++ code it may be necessary to compile PyTorch in debug mode by prepending -your `python -m pip install -e .` call to compile PyTorch with `DEBUG=1`. -Depending on your operating system it may also be necessary to run `py-spy` with -root privileges. +your `python -m pip install -e . -v --no-build-isolation` call to compile +PyTorch with `DEBUG=1`. 
Depending on your operating system it may also be +necessary to run `py-spy` with root privileges. `py-spy` can also work in an `htop`-like "live profiling" mode and can be tweaked to adjust the stack sampling rate, see the `py-spy` readme for more @@ -655,10 +656,10 @@ details. ## Managing multiple build trees -One downside to using `python -m pip install -e .` is that your development -version of PyTorch will be installed globally on your account (e.g., if -you run `import torch` anywhere else, the development version will be -used). +One downside to using `python -m pip install -e . -v --no-build-isolation` is +that your development version of PyTorch will be installed globally on your +account (e.g., if you run `import torch` anywhere else, the development version +will be used). If you want to manage multiple builds of PyTorch, you can make use of [venv environments](https://docs.python.org/3/library/venv.html) to maintain @@ -719,7 +720,7 @@ options. ### Code completion and IDE support -When using `python -m pip install -e .`, PyTorch will generate +When using `python -m pip install -e . -v --no-build-isolation`, PyTorch will generate a `compile_commands.json` file that can be used by many editors to provide command completion and error highlighting for PyTorch's C++ code. You need to `pip install ninja` to generate accurate diff --git a/README.md b/README.md index 65c0bb982bd96..99e6dabd16181 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -![PyTorch Logo](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/pytorch-logo-dark.png) +![PyTorch Logo](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/pytorch-logo-dark.png) -------------------------------------------------------------------------------- @@ -72,7 +72,7 @@ Elaborating Further: If you use NumPy, then you have used Tensors (a.k.a. ndarray). -![Tensor illustration](./docs/source/_static/img/tensor_illustration.png) +![Tensor illustration](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/tensor_illustration.png) PyTorch provides Tensors that can live either on the CPU or the GPU and accelerates the computation by a huge amount. @@ -99,7 +99,7 @@ from several research papers on this topic, as well as current and past work suc While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date. You get the best of speed and flexibility for your crazy research. 
-![Dynamic graph](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/dynamic_graph.gif) +![Dynamic graph](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/dynamic_graph.gif) ### Python First @@ -242,9 +242,8 @@ git submodule update --init --recursive **Common** ```bash -conda install cmake ninja -# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section below -pip install -r requirements.txt +# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section above +pip install --group dev ``` **On Linux** @@ -395,7 +394,7 @@ On macOS ```bash export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" -MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ CMAKE_ONLY=1 python setup.py build +MACOSX_DEPLOYMENT_TARGET=11.0 CMAKE_ONLY=1 python setup.py build ccmake build # or cmake-gui build ``` @@ -560,7 +559,7 @@ To learn more about making a contribution to Pytorch, please see our [Contributi PyTorch is a community-driven project with several skillful engineers and researchers contributing to it. -PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means. +PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), [Alban Desmaison](https://github.com/albanD), [Piotr Bialecki](https://github.com/ptrblck) and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means. A non-exhaustive but growing list needs to mention: [Trevor Killeen](https://github.com/killeent), [Sasank Chilamkurthy](https://github.com/chsasank), [Sergey Zagoruyko](https://github.com/szagoruyko), [Adam Lerer](https://github.com/adamlerer), [Francisco Massa](https://github.com/fmassa), [Alykhan Tejani](https://github.com/alykhantejani), [Luca Antiga](https://github.com/lantiga), [Alban Desmaison](https://github.com/albanD), [Andreas Koepf](https://github.com/andreaskoepf), [James Bradbury](https://github.com/jekbradbury), [Zeming Lin](https://github.com/ebetica), [Yuandong Tian](https://github.com/yuandong-tian), [Guillaume Lample](https://github.com/glample), [Marat Dukhan](https://github.com/Maratyszcza), [Natalia Gimelshein](https://github.com/ngimel), [Christian Sarofeen](https://github.com/csarofeen), [Martin Raison](https://github.com/martinraison), [Edward Yang](https://github.com/ezyang), [Zachary Devito](https://github.com/zdevito). Note: This project is unrelated to [hughperkins/pytorch](https://github.com/hughperkins/pytorch) with the same name. Hugh is a valuable contributor to the Torch community and has helped with many things Torch and PyTorch. 
diff --git a/android/README.md b/android/README.md index 6b8000c13fccc..f0c74750522de 100644 --- a/android/README.md +++ b/android/README.md @@ -2,7 +2,7 @@ ## Demo applications and tutorials -Please refer to [pytorch-labs/executorch-examples](https://github.com/pytorch-labs/executorch-examples/tree/main/dl3/android/DeepLabV3Demo) for the Android demo app based on [ExecuTorch](https://github.com/pytorch/executorch). +Please refer to [meta-pytorch/executorch-examples](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3/android/DeepLabV3Demo) for the Android demo app based on [ExecuTorch](https://github.com/pytorch/executorch). Please join our [Discord](https://discord.com/channels/1334270993966825602/1349854760299270284) for any questions. diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index b02638e5b6de7..a3c98f37a0242 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -96,6 +96,8 @@ file(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp") file(GLOB vulkan_cpp "vulkan/*.cpp") file(GLOB native_vulkan_cpp "native/vulkan/*.cpp" "native/vulkan/api/*.cpp" "native/vulkan/impl/*.cpp" "native/vulkan/ops/*.cpp") +file(GLOB native_eigen_cpp "native/sparse/eigen/*.cpp") + # Metal file(GLOB metal_h "metal/*.h") file(GLOB metal_cpp "metal/*.cpp") @@ -119,6 +121,8 @@ file(GLOB_RECURSE native_mps_cpp "native/mps/*.cpp") file(GLOB_RECURSE native_mps_mm "native/mps/*.mm") file(GLOB_RECURSE native_mps_metal "native/mps/*.metal") file(GLOB_RECURSE native_mps_h "native/mps/*.h") +file(GLOB_RECURSE native_sparse_mps_mm "native/sparse/mps/*.mm") +file(GLOB_RECURSE native_mps_sparse_metal "native/sparse/mps/*.metal") file(GLOB native_sparse_cpp "native/sparse/*.cpp") file(GLOB native_quantized_cpp @@ -178,26 +182,27 @@ file(GLOB native_flash_attn_api_cpp "native/transformers/cuda/flash_attn/flash_a file(GLOB flash_attention_hip_hip "native/transformers/hip/flash_attn/*.hip") # if USE_FLASH_ATTENTION is set, ensure CK instances get generated if(USE_FLASH_ATTENTION) - if(DEFINED ENV{USE_CK_FLASH_ATTENTION}) - set(USE_CK_FLASH_ATTENTION $ENV{USE_CK_FLASH_ATTENTION}) - if(USE_CK_FLASH_ATTENTION STREQUAL "1") - if(DEFINED ENV{PYTORCH_ROCM_ARCH}) - list(LENGTH PYTORCH_ROCM_ARCH NUM_ARCHS) - if(NUM_ARCHS GREATER 1) - message(WARNING "Building CK for multiple archs can increase build time considerably! - Consider setting PYTORCH_ROCM_ARCH env var value as the gfx arch you need to build for") - endif() - endif() - message(STATUS "USE_CK_FLASH_ATTENTION is set; building PyTorch with CK Flash Attention enabled") - message(STATUS "Generating CK kernel instances...") - add_subdirectory(native/transformers/hip/flash_attn/ck) - file(GLOB flash_attention_hip_ck_hip "native/transformers/hip/flash_attn/ck/*.hip") - list(APPEND native_transformers_hip_hip ${flash_attention_hip_ck_hip}) - # FAv3 Generation - add_subdirectory(native/transformers/hip/flash_attn/ck/fav_v3) - file(GLOB flash_attention_v3_hip "native/transformers/hip/flash_attn/ck/fav_v3/*.hip") - list(APPEND native_transformers_hip_hip ${flash_attention_v3_hip}) + if("$ENV{USE_CK_FLASH_ATTENTION}" STREQUAL "1") + message(STATUS "USE_CK_FLASH_ATTENTION is being deprecated. Please use USE_ROCM_CK_SDPA instead") + caffe2_update_option(USE_ROCM_CK_SDPA ON) + endif() + if(USE_ROCM_CK_SDPA) + if(DEFINED ENV{PYTORCH_ROCM_ARCH}) + list(LENGTH PYTORCH_ROCM_ARCH NUM_ARCHS) + if(NUM_ARCHS GREATER 1) + message(WARNING "Building CK for multiple archs can increase build time considerably! 
+ Consider setting PYTORCH_ROCM_ARCH env var value as the gfx arch you need to build for") endif() + endif() + message(STATUS "USE_ROCM_CK_SDPA is set; building PyTorch with CK SDPA enabled") + message(STATUS "Generating CK kernel instances...") + add_subdirectory(native/transformers/hip/flash_attn/ck) + file(GLOB flash_attention_hip_ck_hip "native/transformers/hip/flash_attn/ck/*.hip") + list(APPEND native_transformers_hip_hip ${flash_attention_hip_ck_hip}) + # FAv3 Generation + add_subdirectory(native/transformers/hip/flash_attn/ck/fav_v3) + file(GLOB flash_attention_v3_hip "native/transformers/hip/flash_attn/ck/fav_v3/*.hip") + list(APPEND native_transformers_hip_hip ${flash_attention_v3_hip}) endif() file(GLOB flash_attention_hip_aot_hip "native/transformers/hip/flash_attn/aot/*.hip") file(GLOB flash_attention_src_hip_hip "native/transformers/hip/flash_attn/src/*.hip") @@ -211,7 +216,7 @@ file(GLOB mem_eff_attention_cuda_cpp "native/transformers/cuda/mem_eff_attention if(USE_CUDA AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)) add_library(flash_attention OBJECT EXCLUDE_FROM_ALL ${flash_attention_cuda_kernels_cu} ${flash_attention_cuda_cpp}) - target_include_directories(flash_attention PUBLIC + target_include_directories(flash_attention SYSTEM PUBLIC ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc ${PROJECT_SOURCE_DIR}/third_party/flash-attention/include ${PROJECT_SOURCE_DIR}/third_party/cutlass/include @@ -255,39 +260,78 @@ endif() # FBGEMM GenAI IF(USE_FBGEMM_GENAI) set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/) - set(FBGEMM_GENAI_DIR ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize) - - if(USE_ROCM) - # Only include the kernels we want to build to avoid increasing binary size. - file(GLOB_RECURSE fbgemm_genai_native_rocm_hip - "${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip" - "${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip") - set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - - # Add additional HIPCC compiler flags for performance - set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS - -mllvm - -amdgpu-coerce-illegal-types=1 - -mllvm - -enable-post-misched=0 - -mllvm - -greedy-reverse-local-assignment=1 - -fhip-new-launch-api) + set(FBGEMM_GENAI_SRCS ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize) + if(USE_CUDA) + # To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build. + # If you want to integrate a kernel from FBGEMM into torch, you have to add it here. 
+ set(FBGEMM_CUTLASS_KERNELS_REGEX ".*mx8mx8bf16_grouped.*") + file(GLOB_RECURSE fbgemm_genai_native_cuda_cu + "${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu" + "${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu") + list(FILTER fbgemm_genai_native_cuda_cu INCLUDE REGEX ${FBGEMM_CUTLASS_KERNELS_REGEX}) + + file(GLOB_RECURSE fbgemm_genai_native_cuda_cpp + "${FBGEMM_GENAI_SRCS}/common/*.cpp" + ) + + # Combine all source files into a single list + list(APPEND fbgemm_genai_all_sources + ${fbgemm_genai_native_cuda_cu} + ${fbgemm_genai_native_cuda_cpp} + ) hip_add_library( fbgemm_genai STATIC ${fbgemm_genai_native_rocm_hip} HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) + + set(fbgemm_genai_mx8mx8bf16_grouped + "${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/" + ) target_include_directories(fbgemm_genai PUBLIC - # FBGEMM version of Composable Kernel is used due to some customizations - ${FBGEMM_THIRD_PARTY}/composable_kernel/include - ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include - ${FBGEMM_GENAI_DIR}/include/ - ${FBGEMM_GENAI_DIR}/common/include/ + ${FBGEMM_THIRD_PARTY}/cutlass/include + ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include + ${fbgemm_genai_mx8mx8bf16_grouped} + ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp + ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h ) + else() + if(USE_ROCM) + # Only include the kernels we want to build to avoid increasing binary size. + file(GLOB_RECURSE fbgemm_genai_native_rocm_hip + "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip" + "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip") + set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + + # Add additional HIPCC compiler flags for performance + set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS + -mllvm + -amdgpu-coerce-illegal-types=1 + -mllvm + -enable-post-misched=0 + -mllvm + -greedy-reverse-local-assignment=1 + -fhip-new-launch-api) + + hip_add_library( + fbgemm_genai STATIC + ${fbgemm_genai_native_rocm_hip} + HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) + set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) + + target_include_directories(fbgemm_genai PUBLIC + # FBGEMM version of Composable Kernel is used due to some customizations + ${FBGEMM_THIRD_PARTY}/composable_kernel/include + ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include + ${FBGEMM_THIRD_PARTY}/cutlass/include + ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include + ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp + ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h + ) + endif() endif() endif() @@ -338,6 +382,9 @@ if(USE_VULKAN) else() set(all_cpu_cpp ${all_cpu_cpp} ${vulkan_cpp}) endif() +if(USE_EIGEN_SPARSE) + set(all_cpu_cpp ${all_cpu_cpp} ${native_eigen_cpp}) +endif() if(USE_MTIA) set(ATen_MTIA_SRCS ${ATen_MTIA_SRCS} ${mtia_cpp} ${mtia_h} ${native_mtia_cpp} ${native_mtia_h}) @@ -416,40 +463,42 @@ if(USE_CUDA) endif() if(USE_ROCM) - # NOTE: The PyTorch build does not actually add_subdirectory - # third_party/composable_kernel or use it as a CMake 
library. What is used - # is header only, so this should be ok, except that the CMake build generates - # a ck/config.h. We just do that part here. Without this, the ck.h from the - # ROCM SDK may get accidentally used instead. - function(_pytorch_rocm_generate_ck_conf) - set(CK_ENABLE_INT8 "ON") - set(CK_ENABLE_FP16 "ON") - set(CK_ENABLE_FP32 "ON") - set(CK_ENABLE_FP64 "ON") - set(CK_ENABLE_BF16 "ON") - set(CK_ENABLE_FP8 "ON") - set(CK_ENABLE_BF8 "ON") - set(CK_USE_XDL "ON") - set(CK_USE_WMMA "ON") - configure_file( - "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" - "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" - ) - endfunction() - list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip) - list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) - list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) - list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/example/ck_tile/01_fmha) - list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) - list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include) - _pytorch_rocm_generate_ck_conf() + if((USE_FLASH_ATTENTION AND USE_ROCM_CK_SDPA) OR USE_ROCM_CK_GEMM) + # NOTE: The PyTorch build does not actually add_subdirectory + # third_party/composable_kernel or use it as a CMake library. What is used + # is header only, so this should be ok, except that the CMake build generates + # a ck/config.h. We just do that part here. Without this, the ck.h from the + # ROCM SDK may get accidentally used instead. + function(_pytorch_rocm_generate_ck_conf) + set(CK_ENABLE_INT8 "ON") + set(CK_ENABLE_FP16 "ON") + set(CK_ENABLE_FP32 "ON") + set(CK_ENABLE_FP64 "ON") + set(CK_ENABLE_BF16 "ON") + set(CK_ENABLE_FP8 "ON") + set(CK_ENABLE_BF8 "ON") + set(CK_USE_XDL "ON") + set(CK_USE_WMMA "ON") + configure_file( + "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" + ) + endfunction() + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/example/ck_tile/01_fmha) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include) + _pytorch_rocm_generate_ck_conf() + endif() # Next two lines are needed because TunableOp uses third-party/fmt list(APPEND ATen_HIP_INCLUDE $) list(APPEND ATen_HIP_DEPENDENCY_LIBS fmt::fmt-header-only) -if(USE_FLASH_ATTENTION) - list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/transformers/hip/flash_attn/ck) -endif() + if(USE_FLASH_ATTENTION AND USE_ROCM_CK_SDPA) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/transformers/hip/flash_attn/ck) + endif() list(APPEND ATen_HIP_SRCS ${ATen_HIP_SRCS} ${hip_hip} @@ -459,12 +508,13 @@ endif() ${native_quantized_hip_hip} ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} ) - if(WIN32) # Windows doesn't support Composable Kernels + if(NOT USE_ROCM_CK_GEMM) file(GLOB native_hip_bgemm 
"native/hip/bgemm_kernels/*.hip") file(GLOB native_hip_ck "native/hip/ck*.hip") exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" ${native_hip_bgemm} ${native_hip_ck}) endif() + # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) list(APPEND all_hip_cpp ${native_nested_hip_cpp} @@ -624,12 +674,26 @@ if(USE_CUDA AND NOT USE_ROCM) add_definitions(-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include) + + # Add FBGEMM_GENAI include directories for torch_ops.h + if(USE_FBGEMM_GENAI) + list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include) + list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include) + endif() + if($ENV{ATEN_STATIC_CUDA}) - list(APPEND ATen_CUDA_DEPENDENCY_LIBS - ${CUDA_LIBRARIES} - CUDA::cusparse_static - CUDA::cufft_static_nocallback - ) + if(CUDA_VERSION VERSION_LESS_EQUAL 12.9) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_LIBRARIES} + CUDA::cusparse_static + CUDA::cufft_static_nocallback) + else() + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_LIBRARIES} + CUDA::cusparse_static + CUDA::cufft_static) + endif() + if(NOT BUILD_LAZY_CUDA_LINALG) list(APPEND ATen_CUDA_DEPENDENCY_LIBS CUDA::cusolver_static @@ -699,10 +763,10 @@ endif() if(USE_MPS) include(../../../cmake/Metal.cmake) - set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h}) + set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h} ${native_sparse_mps_mm}) if(CAN_COMPILE_METAL) - foreach(SHADER ${native_mps_metal}) + foreach(SHADER ${native_mps_metal} ${native_mps_sparse_metal}) cmake_path(GET SHADER STEM TGT_STEM) string(CONCAT TGT_BASIC ${TGT_STEM} "_31.air") list(APPEND AIR_BASIC ${TGT_BASIC}) @@ -717,7 +781,7 @@ if(USE_MPS) add_custom_target(metallibs DEPENDS kernels_basic.metallib metallib_dummy.cpp) else() file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/native/mps") - foreach(SHADER ${native_mps_metal}) + foreach(SHADER ${native_mps_metal} ${native_mps_sparse_metal}) cmake_path(GET SHADER STEM TGT_STEM) string(CONCAT SHADER_HDR_NAME "${CMAKE_CURRENT_BINARY_DIR}" /native/mps/ ${TGT_STEM} "_metallib.h") metal_to_metallib_h(${SHADER} ${SHADER_HDR_NAME}) diff --git a/aten/src/ATen/Config.h.in b/aten/src/ATen/Config.h.in index c22e15a52aa23..0bae6d4af6e5e 100644 --- a/aten/src/ATen/Config.h.in +++ b/aten/src/ATen/Config.h.in @@ -20,3 +20,4 @@ #define AT_BLAS_F2C() @AT_BLAS_F2C@ #define AT_BLAS_USE_CBLAS_DOT() @AT_BLAS_USE_CBLAS_DOT@ #define AT_KLEIDIAI_ENABLED() @AT_KLEIDIAI_ENABLED@ +#define AT_USE_EIGEN_SPARSE() @AT_USE_EIGEN_SPARSE@ diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 03529c64d6cac..4d48084b0ab89 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -480,6 +480,9 @@ at::BlasBackend Context::blasPreferredBackend() { // call site for blasPreferredBackend(), we set it to an actual value. 
if (blas_preferred_backend == at::BlasBackend::Default) { blas_preferred_backend = at::BlasBackend::Cublas; + // This logic sits in the getter because it needs to validate + // values set via env vars such as TORCH_BLAS_PREFER_CUBLASLT + // which initialize the backend without calling the setter #ifdef USE_ROCM // AMD Instinct targets prefer hipblaslt static const bool hipblaslt_preferred = []() { @@ -509,6 +512,10 @@ at::BlasBackend Context::blasPreferredBackend() { // hipblaslt support for all archs is not as complete as hipblas if (blas_preferred_backend == at::BlasBackend::Cublaslt) { static const bool hipblaslt_unsupported = []() { + if(!hasCuBLASLt()) + { + return true; + } static const std::vector archs = { "gfx90a", "gfx942", #if ROCM_VERSION >= 60300 @@ -534,6 +541,24 @@ at::BlasBackend Context::blasPreferredBackend() { return blas_preferred_backend; } +bool Context::ckSupported() { +#ifdef USE_ROCM + static const std::vector supported_archs = { + "gfx90a", "gfx942", "gfx950" + }; + for (auto index : c10::irange(detail::getCUDAHooks().deviceCount())) { + if(!detail::getCUDAHooks().isGPUArch(supported_archs, index)) { + TORCH_WARN_ONCE( + "Attempting to use CK on an unsupported architecture! Cannot set backend to CK"); + return false; + } + } + return true; +#else + return false; +#endif +} + void Context::setBlasPreferredBackend(at::BlasBackend b) { #ifdef _MSC_VER TORCH_WARN_ONCE( @@ -543,8 +568,14 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) { #else TORCH_CHECK((b != at::BlasBackend::Cublaslt) || hasCuBLASLt(), "Cannot set preferred backend to cuBLASLt if PyTorch has not been compiled with cuBLASLt."); - TORCH_CHECK((b != at::BlasBackend::Ck) || hasROCM(), - "Cannot set preferred backend to Ck if PyTorch has not been compiled for ROCm."); +#ifdef USE_ROCM + static const bool ckSupportedFlag = ckSupported(); + static const bool hasCKGEMMFlag = hasCKGEMM(); + TORCH_CHECK((b != at::BlasBackend::Ck) || (ckSupportedFlag && hasCKGEMMFlag), + "Cannot set preferred blas backend to CK since following conditions are not true: ", + "architecture supported for CK: ", ckSupportedFlag, + ", PyTorch built with CK GEMM support: ", hasCKGEMMFlag); +#endif if (b != at::BlasBackend::Default && b != at::BlasBackend::Cublas) { TORCH_WARN_ONCE( "torch.backends.cuda.preferred_blas_library is an experimental feature. " @@ -556,35 +587,40 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) { #endif } -at::ROCmFABackend Context::getROCmFAPreferredBackend() const { +at::ROCmFABackend Context::getROCmFAPreferredBackend() { +#ifdef USE_ROCM + // Set potential "Default" value so we don't have to interpret at call sites. + // We use aotriton backend as the default, for now. 
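[Editorial aside, not part of the patch: a minimal sketch of how a caller could exercise the CK validation path added in this hunk. It uses only symbols visible above (`at::globalContext()`, `setBlasPreferredBackend`, `at::BlasBackend::Ck`, `blasPreferredBackend`); the ROCm build, the `main` wrapper, and the printed output are assumptions for illustration.]

```cpp
// Illustrative sketch only; assumes a ROCm build of PyTorch.
#include <ATen/Context.h>
#include <c10/util/Exception.h>
#include <iostream>

int main() {
  at::Context& ctx = at::globalContext();

  // With this patch, requesting CK requires both a supported gfx arch
  // (ckSupported) and a build with CK GEMM enabled (hasCKGEMM); otherwise
  // the setter rejects the request via TORCH_CHECK with the failing
  // conditions spelled out in the error message.
  try {
    ctx.setBlasPreferredBackend(at::BlasBackend::Ck);
  } catch (const c10::Error& e) {
    std::cerr << "CK backend rejected: " << e.what() << "\n";
  }

  // The getter resolves Default to a concrete backend and re-validates
  // values that were initialized from env vars such as
  // TORCH_BLAS_PREFER_CUBLASLT, as the comment above explains.
  at::BlasBackend active = ctx.blasPreferredBackend();
  std::cout << "active BLAS backend: " << static_cast<int>(active) << "\n";
  return 0;
}
```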
+ if(rocm_fa_preferred_backend == at::ROCmFABackend::Default) { + rocm_fa_preferred_backend = at::ROCmFABackend::AOTriton; + } else if (rocm_fa_preferred_backend == at::ROCmFABackend::Ck) { + // This logic sits in the getter because it needs to validate + // values set via env vars such as TORCH_ROCM_FA_PREFER_CK + // which initialize the backend without calling the setter + // Perform validity checking + static const bool hasCKSDPAFlag = hasCKSDPA(); + static const bool ckSupportedFlag = ckSupported(); + if(!(hasCKSDPAFlag && ckSupportedFlag)){ + TORCH_WARN_ONCE( + "Cannot set preferred SDPA backend to CK since following conditions are not true: ", + "architecture supported for CK: ", ckSupportedFlag, + ", PyTorch built with CK SDPA support: ", hasCKSDPAFlag); + rocm_fa_preferred_backend = at::ROCmFABackend::AOTriton; + } + } +#endif + return rocm_fa_preferred_backend; } void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) { - - // TODO: add plumbing for hasCK for validity checking - TORCH_CHECK((b != at::ROCmFABackend::Ck) || hasROCM(), - "Cannot set preferred flash attention backend to Ck if PyTorch has not been compiled for ROCm."); #ifdef USE_ROCM - if(b == at::ROCmFABackend::Ck) { - static const bool ck_unsupported = []() { - static const std::vector archs = { - "gfx90a", "gfx942", "gfx950" - }; - for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) { - if (!detail::getCUDAHooks().isGPUArch(archs, index)) { - TORCH_WARN_ONCE( - "Attempting to use CK on an unsupported architecture! Cannot set backend to CK"); - return true; - } - } - return false; - }(); - if(!ck_unsupported) rocm_fa_preferred_backend = b; - } - else { - rocm_fa_preferred_backend = b; - } + static const bool hasCKSDPAFlag = hasCKSDPA(); + static const bool ckSupportedFlag = ckSupported(); + TORCH_CHECK((b != at::ROCmFABackend::Ck) || (hasCKSDPAFlag && ckSupportedFlag), + "Cannot set preferred SDPA backend to CK since following conditions are not true: ", + "architecture supported for CK: ", ckSupportedFlag, + ", PyTorch built with CK SDPA support: ", hasCKSDPAFlag); #endif rocm_fa_preferred_backend = b; } @@ -662,6 +698,14 @@ bool Context::hasLAPACK() { #endif } +bool Context::hasEigenSparse() { +#if AT_USE_EIGEN_SPARSE() + return true; +#else + return false; +#endif +} + at::QEngine Context::qEngine() const { static auto _quantized_engine = []() { at::QEngine qengine = at::kNoQEngine; diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 945076f3f0124..5cfa9b23e20aa 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -132,6 +132,8 @@ class TORCH_API Context { static bool hasKleidiAI(); static bool hasLAPACK(); static bool hasMKLDNN(); + static bool ckSupported(); + static bool hasEigenSparse(); static bool hasMAGMA() { return detail::getCUDAHooks().hasMAGMA(); } @@ -162,6 +164,12 @@ class TORCH_API Context { static bool hasROCM() { return detail::getCUDAHooks().hasROCM(); } + static bool hasCKSDPA() { + return detail::getCUDAHooks().hasCKSDPA(); + } + static bool hasCKGEMM() { + return detail::getCUDAHooks().hasCKGEMM(); + } static bool hasHIP() { return detail::getHIPHooks().hasHIP(); } @@ -252,7 +260,7 @@ class TORCH_API Context { at::BlasBackend blasPreferredBackend(); void setBlasPreferredBackend(at::BlasBackend); - at::ROCmFABackend getROCmFAPreferredBackend() const; + at::ROCmFABackend getROCmFAPreferredBackend(); void setROCmFAPreferredBackend(at::ROCmFABackend); // Note [Enabling Deterministic Operations] @@ -608,6 +616,10 @@ inline bool hasLAPACK() { 
return globalContext().hasLAPACK(); } +inline bool hasEigenSparse() { + return globalContext().hasEigenSparse(); +} + inline bool hasMAGMA() { return globalContext().hasMAGMA(); } diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index 9632cd5ed6983..98ad757946bec 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -308,17 +308,44 @@ void fillVersion( // constructed out of ATen tensor template T* toDLPackImpl(const Tensor& src) { - // create a new tensor with possibly normalized strides - // gh-83069 - auto shape = src.sizes(); - auto strides = src.strides().vec(); - for (int i = 0; i < src.dim(); i++) { - if (shape[i] < 2) { - strides[i] = 1; + auto view = src; + + // Detect whether there is need to normalize the strides + // Background: gh-83069 + // + // However, normalizing strides can come at a high-cost + // to slow down toDLPack conversion 3x, so we + // only normalize if needed. + // + // The following code detects whether the src follows + // a continuous pattern. If the src follows such pattern (common-case) + // then we do not need to normalize the strides. + bool need_normalize_strides = false; + int64_t expected_stride = 1; + for (int i = src.dim() - 1; i >= 0; i--) { + // detect if we do not meet continuous pattern + // and the size is 1, so there is opportunity to normalize + if (src.stride(i) != expected_stride && src.size(i) == 1) { + need_normalize_strides = true; + break; + } + expected_stride *= src.size(i); + } + + // less common case, try normalizing the strides + if (need_normalize_strides) { + // create a new tensor with possibly normalized strides + // gh-83069 + auto shape = src.sizes(); + auto strides = src.strides().vec(); + for (int i = 0; i < src.dim(); i++) { + if (shape[i] < 2) { + strides[i] = 1; + } } + view = src.as_strided(shape, strides, src.storage_offset()); } - auto view = src.as_strided(shape, strides, src.storage_offset()); ATenDLMTensor* atDLMTensor(new ATenDLMTensor); atDLMTensor->handle = view; atDLMTensor->tensor.manager_ctx = atDLMTensor; diff --git a/aten/src/ATen/DTensorState.cpp b/aten/src/ATen/DTensorState.cpp new file mode 100644 index 0000000000000..0644aae3d0709 --- /dev/null +++ b/aten/src/ATen/DTensorState.cpp @@ -0,0 +1,17 @@ +#include + +namespace at { + +namespace { +thread_local bool kDTensorAllowImplicitReplication = false; +} + +bool get_dtensor_allow_implicit_replication() { + return kDTensorAllowImplicitReplication; +} + +void set_dtensor_allow_implicit_replication(bool enabled) { + kDTensorAllowImplicitReplication = enabled; +} + +} // namespace at diff --git a/aten/src/ATen/DTensorState.h b/aten/src/ATen/DTensorState.h new file mode 100644 index 0000000000000..07e89eaeddae7 --- /dev/null +++ b/aten/src/ATen/DTensorState.h @@ -0,0 +1,34 @@ +#pragma once + +#include + +namespace at { + +TORCH_API bool get_dtensor_allow_implicit_replication(); +TORCH_API void set_dtensor_allow_implicit_replication(bool enabled); + +struct DTensorAllowImplicitReplication { + DTensorAllowImplicitReplication() + : prev_dtensor_allow_implicit_replication_( + get_dtensor_allow_implicit_replication()) { + set_dtensor_allow_implicit_replication(true); + } + + DTensorAllowImplicitReplication(const DTensorAllowImplicitReplication&) = + delete; + DTensorAllowImplicitReplication& operator=( + const DTensorAllowImplicitReplication&) = delete; + DTensorAllowImplicitReplication(DTensorAllowImplicitReplication&&) = delete; + DTensorAllowImplicitReplication& operator=( + DTensorAllowImplicitReplication&&) 
= delete; + + ~DTensorAllowImplicitReplication() { + set_dtensor_allow_implicit_replication( + prev_dtensor_allow_implicit_replication_); + } + + private: + bool prev_dtensor_allow_implicit_replication_; +}; + +} // namespace at diff --git a/aten/src/ATen/DeviceAccelerator.h b/aten/src/ATen/DeviceAccelerator.h index f37e492c861fe..f23b35047fcc8 100644 --- a/aten/src/ATen/DeviceAccelerator.h +++ b/aten/src/ATen/DeviceAccelerator.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -72,6 +73,27 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index); // original device index that was active before the change. TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index); +TORCH_API inline void emptyCache() { + const auto device_type = getAccelerator(true).value(); + at::getDeviceAllocator(device_type)->emptyCache(); +} + +TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats( + c10::DeviceIndex device_index) { + const auto device_type = getAccelerator(true).value(); + return at::getDeviceAllocator(device_type)->getDeviceStats(device_index); +} + +TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) { + const auto device_type = getAccelerator(true).value(); + at::getDeviceAllocator(device_type)->resetAccumulatedStats(device_index); +} + +TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) { + const auto device_type = getAccelerator(true).value(); + at::getDeviceAllocator(device_type)->resetPeakStats(device_index); +} + } // namespace at::accelerator namespace at { diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index 5634733325a2e..0e535ab20cd21 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -31,7 +31,9 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { return at::globalContext().getPinnedMemoryAllocator(opt_device_type); } else { TORCH_CHECK( - false, "Need to provide pin_memory allocator to use pin memory.") + false, + "pin_memory=True requires a CUDA or other accelerator backend; " + "no pinned memory allocator is available on this system.") } } diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index 33977d8d7cf8a..22509c7be4e19 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -8,6 +8,7 @@ #include #include #include +#include namespace at { @@ -19,6 +20,7 @@ ThreadLocalState::ThreadLocalState() torch_dispatch_mode_state_(c10::impl::TorchDispatchModeTLS::get_state()), python_dispatcher_state_(c10::impl::PythonDispatcherTLS::get_state()), python_torch_function_state_(at::impl::PythonTorchFunctionTLS::get_state()), saved_tensors_default_hooks_state_(at::SavedTensorDefaultHooks::get_tls_state()), functionalization_reapply_views_state_(at::functionalization::impl::getFunctionalizationReapplyViewsTLS()), + dtensor_allow_implicit_replication_(at::get_dtensor_allow_implicit_replication()), saved_objects_(at::impl::ThreadLocalPythonObjects::get_state()) { #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) && !defined(BUILD_LITE_INTERPRETER) for(size_t i=0; i>& tensor_sizes) { for (auto& sizes : tensor_sizes) { if (sizes.size() == 1) { - if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[0].sym_eq(0))) { + if (TORCH_GUARD_OR_FALSE(sizes[0].sym_eq(0))) { continue; } } @@ -135,7 +135,7 @@ inline int64_t legacy_cat_wrap_dim( const MaterializedITensorListRef& tensors) { for (const Tensor& tensor : tensors) { if (tensor.dim() == 1) { - if 
(TORCH_GUARD_SIZE_OBLIVIOUS(tensor.sym_sizes()[0].sym_eq(0))) { + if (TORCH_GUARD_OR_FALSE(tensor.sym_sizes()[0].sym_eq(0))) { continue; } } diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index afd0a6b67674a..4b8b5f6c5d187 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -216,6 +216,7 @@ TORCH_LIBRARY_IMPL(aten, AutocastMPS, m) { KERNEL_MPS(_convolution, lower_precision_fp) KERNEL_MPS(conv1d, lower_precision_fp) KERNEL_MPS(conv2d, lower_precision_fp) + KERNEL_MPS(conv3d, lower_precision_fp) KERNEL_MPS(conv_tbc, lower_precision_fp) KERNEL_MPS(conv_transpose1d, lower_precision_fp) KERNEL_MPS(conv_transpose2d, input, lower_precision_fp) @@ -239,6 +240,7 @@ TORCH_LIBRARY_IMPL(aten, AutocastMPS, m) { KERNEL_MPS(scaled_dot_product_attention, lower_precision_fp) // fp32 + KERNEL_MPS(conv_transpose3d, input, fp32) KERNEL_MPS(acos, fp32) KERNEL_MPS(asin, fp32) KERNEL_MPS(cosh, fp32) diff --git a/aten/src/ATen/core/CachingHostAllocator.h b/aten/src/ATen/core/CachingHostAllocator.h index 5049018d731e1..53e95cd2d4cfd 100644 --- a/aten/src/ATen/core/CachingHostAllocator.h +++ b/aten/src/ATen/core/CachingHostAllocator.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -251,6 +252,7 @@ struct CachingHostAllocatorImpl { auto* block = reinterpret_cast(ctx); std::optional> events; + ska::flat_hash_set streams; { std::lock_guard g(block->mutex_); block->allocated_ = false; @@ -259,14 +261,19 @@ struct CachingHostAllocatorImpl { } else { events = std::vector(); events->reserve(block->streams_.size()); - for (auto stream : block->streams_) { - record_stream(events, stream); - } - block->event_count_ += events->size(); + block->event_count_ += block->streams_.size(); + // Move out streams to avoid holding the mutex during event recording + streams = std::move(block->streams_); block->streams_.clear(); } } + // Event recording must be done outside the mutex to avoid potential + // deadlocks (e.g., when Python GIL is involved) + for (auto stream : streams) { + record_stream(events, stream); + } + if (!events) { auto index = size_index(block->size_); std::lock_guard g(free_list_[index].mutex_); @@ -345,7 +352,8 @@ struct CachingHostAllocatorImpl { } virtual bool pinned_use_background_threads() { - return false; + return c10::CachingAllocator::AcceleratorAllocatorConfig:: + pinned_use_background_threads(); } virtual void copy_data(void* dest [[maybe_unused]], const void* src [[maybe_unused]], std::size_t count [[maybe_unused]]) const { diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index 8463379149e27..5f43738ea0faf 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -1,5 +1,18 @@ #pragma once +// See https://github.com/pytorch/pytorch/issues/161660 +// This compile flag is intended to be passed in to CppExtensions that rely on +// the stable ABI via the `extra_compile_args` argument. This is a stopgap +// solution to ensure that non-stable libtorch APIs are not used in the extension. +// The long term solution is to have a torch_stable target that excludes headers +// that are not in torch/stable or torch/headeronly. +// See test/cpp_extensions/torch_stable_test_extension/setup.py for an example +// of how this is used. 
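[Editorial aside, not part of the patch: a sketch of what the guard that follows does to an extension translation unit. The file name and the `-DTORCH_STABLE_ONLY` define passed through `extra_compile_args` are assumptions for illustration; the real usage lives in test/cpp_extensions/torch_stable_test_extension as the comment above notes.]

```cpp
// my_extension.cpp -- hypothetical extension TU, compiled with
// -DTORCH_STABLE_ONLY (e.g. via extra_compile_args in a CppExtension
// setup.py).
//
// TensorBase.h is not part of the stable surface, so including any header
// that transitively pulls it in now fails at compile time with the #error
// added in the guard below.
#include <ATen/core/TensorBase.h>  // compile error under TORCH_STABLE_ONLY

// Headers under torch/stable or torch/headeronly are the intended
// includes for TORCH_STABLE_ONLY builds.
```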
+#ifdef TORCH_STABLE_ONLY +#error \ + "TensorBase.h should not be included when TORCH_STABLE_ONLY compile flag is passed" +#endif + #include #include #include diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h index 06bcc5d4f49b8..4300217235b84 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.h +++ b/aten/src/ATen/core/boxing/KernelFunction.h @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include namespace c10 { @@ -17,6 +19,9 @@ class OperatorHandle; struct OperatorKernel; class KernelFunction; +class KernelToken; +class SafeKernelFunction; + template using has_symint = std::disjunction< std::is_same, @@ -90,6 +95,12 @@ class TORCH_API KernelFunction final { BoxedKernel::BoxedKernelFunction_withDispatchKeys; KernelFunction(); + ~KernelFunction(); + + KernelFunction(const KernelFunction& other); + KernelFunction& operator=(const KernelFunction& other); + + KernelFunction(KernelFunction&&) noexcept = default; // Fast path for dispatch to allow not touching the boxed kernel in // the common case where unboxed is available. @@ -262,6 +273,9 @@ class TORCH_API KernelFunction final { // For testing internal invariants only bool _equalsBoxedAndUnboxed(const KernelFunction&) const; + // Register a token to be invalidated when this KernelFunction is destroyed + void registerToken(std::weak_ptr token) const; + private: explicit KernelFunction( std::unique_ptr functor, @@ -276,6 +290,50 @@ class TORCH_API KernelFunction final { BoxedKernel boxed_kernel_func_; void* unboxed_kernel_func_; void* sym_unboxed_kernel_func_; + // List of tokens that need to be invalidated when this KernelFunction is + // destroyed (lazy allocation to save memory when empty) + mutable std::unique_ptr>> tokens_; +}; + +// Token held by SafeKernelFunction that gets invalidated when KernelFunction is +// destroyed +class KernelToken { + public: + bool isValid() const; + void invalidate(); + + private: + std::atomic invalid_{false}; +}; + +class SafeKernelFunction { + public: + SafeKernelFunction( + const KernelFunction* kernel, + std::string debug, + std::shared_ptr opHandle); + + // Safe callBoxed - checks token validity first + void callBoxed( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Stack* stack) const; + + // Get debug information + const std::string& debug() const { + return debug_; + } + + // Get the OpHandle that lives on this SafeKernelFunction + const OperatorHandle& opHandle() const { + return *opHandle_; + } + + private: + KernelFunction kernel_; + std::shared_ptr token_; + std::string debug_; + std::shared_ptr opHandle_; }; } // namespace c10 diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index df49d6227ee93..672309ec19a2c 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -15,7 +15,7 @@ std::enable_if_t< std::is_base_of_v, std::unique_ptr> make_unique_base(Args&&... 
args) { - return std::unique_ptr(new Child(std::forward(args)...)); + return std::make_unique(std::forward(args)...); } } // namespace detail @@ -24,6 +24,36 @@ inline KernelFunction::KernelFunction() unboxed_kernel_func_(nullptr), sym_unboxed_kernel_func_(nullptr) {} +inline KernelFunction::~KernelFunction() { + if (tokens_) { + for (auto& weak_token : *tokens_) { + if (auto token = weak_token.lock()) { + token->invalidate(); + } + } + } +} + +inline KernelFunction::KernelFunction(const KernelFunction& other) + : boxed_kernel_func_(other.boxed_kernel_func_), + unboxed_kernel_func_(other.unboxed_kernel_func_), + sym_unboxed_kernel_func_(other.sym_unboxed_kernel_func_) { + // tokens_ is intentionally not copied as we only care about invalidating + // tokens if the original KernelFunction is destroyed +} + +inline KernelFunction& KernelFunction::operator=(const KernelFunction& other) { + if (this != &other) { + boxed_kernel_func_ = other.boxed_kernel_func_; + unboxed_kernel_func_ = other.unboxed_kernel_func_; + sym_unboxed_kernel_func_ = other.sym_unboxed_kernel_func_; + + // tokens_ is intentionally not copied as we only care about invalidating + // tokens if the original KernelFunction is destroyed + } + return *this; +} + inline KernelFunction::KernelFunction( std::unique_ptr functor, InternalBoxedKernelFunction* boxed_kernel_func, @@ -157,6 +187,14 @@ C10_ALWAYS_INLINE Return KernelFunction::call( std::forward(args)...); } +inline void KernelFunction::registerToken( + std::weak_ptr token) const { + if (!tokens_) { + tokens_ = std::make_unique>>(); + } + tokens_->push_back(std::move(token)); +} + inline KernelFunction KernelFunction::makeFromBoxedKernel( BoxedKernel boxed_fn) { return KernelFunction( @@ -317,4 +355,38 @@ KernelFunction::makeFromUnboxedLambda(Lambda&& lambda) { std::forward(lambda))); } +inline bool KernelToken::isValid() const { + return !invalid_.load(std::memory_order_acquire); +} + +inline void KernelToken::invalidate() { + invalid_.store(true, std::memory_order_release); +} + +inline SafeKernelFunction::SafeKernelFunction( + const KernelFunction* kernel, + std::string debug, + std::shared_ptr opHandle) + : kernel_(kernel ? 
*kernel : KernelFunction()), + token_(std::make_shared()), + debug_(std::move(debug)), + opHandle_(std::move(opHandle)) { + // Register the token with the original kernel so it gets invalidated when the + // kernel is destroyed + if (kernel) { + kernel->registerToken(token_); + } +} + +inline void SafeKernelFunction::callBoxed( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Stack* stack) const { + TORCH_CHECK( + token_ && token_->isValid(), + "SafeKernelFunction has been invalidated ", + debug_); + kernel_.callBoxed(opHandle, dispatchKeySet, stack); +} + } // namespace c10 diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index bc043df6a93e9..43eb0028c70fe 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -487,6 +487,10 @@ class TORCH_API OperatorHandle { return operatorDef_->op.hasComputedKernelForDispatchKey(k); } + SafeKernelFunction getComputedKernelForDispatchKey(DispatchKey k) const { + return operatorDef_->op.getComputedKernelForDispatchKey(k); + } + std::string dumpComputedTable() const { return operatorDef_->op.dumpComputedTable(); } diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index b4063fb720be0..c172e9b9c6096 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -315,6 +315,42 @@ const AnnotatedKernel* OperatorEntry::getKernelForDispatchKey(DispatchKey dispat return nullptr; } +SafeKernelFunction OperatorEntry::getComputedKernelForDispatchKey( + DispatchKey k) const { + TORCH_CHECK( + !isAliasDispatchKey(k), + "Alias keys do not have runtime kernel registrations."); + const auto dispatch_ix = getDispatchTableIndexForDispatchKey(k); + TORCH_CHECK( + dispatchTable_[dispatch_ix].isValid(), + "no kernel for ", + k, + " for ", + name_); + + // Get the KernelFunction object from kernels_ to pass to SafeKernelFunction + + // The KernelFunction object in dispatchTable_ is a copy of the KernelFunction + // in the AnnotatedKernel in kernels_. A KernelFunction is only truly + // deregistered when the kernel is removed from kernels_. However, the + // KernelFunction in dispatchTable_ might be removed before it is deregistered + // (when a newer kernel is registered). Therefore, here we want to return a + // SafeKernelFunction that is backed by the original KernelFunction in + // kernels_, so that we only invalidate it when the kernel is deregistered. + auto [annotatedKernel, _] = + computeDispatchTableEntryWithDebug(c10::Dispatcher::singleton(), k); + + // Use findSchemaOrThrow to get OpHandle for the OperatorEntry + auto& dispatcher = c10::Dispatcher::singleton(); + auto opHandle = dispatcher.findSchemaOrThrow( + name_.name.c_str(), name_.overload_name.c_str()); + + return SafeKernelFunction( + &annotatedKernel.kernel, + annotatedKernel.debug, + std::make_shared(opHandle)); +} + const std::vector& OperatorEntry::getTags() const { #if defined C10_MOBILE TORCH_CHECK(false, "tags are not saved for Mobile"); diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index 83200ff9c94ff..59b54ce1d9d32 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -217,6 +217,8 @@ class TORCH_API OperatorEntry final { const KernelFunction& kernelForDispatchKey(DispatchKey k) const; // Returns true if the "computed table" has an entry for a particular key. 
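[Editorial aside, not part of the patch: a hedged sketch of how the new `SafeKernelFunction` might be obtained and invoked. It uses only entry points introduced in this patch (`OperatorHandle::getComputedKernelForDispatchKey`, `SafeKernelFunction::callBoxed`) plus existing dispatcher utilities; the operator choice (`aten::add.Tensor` on CPU) and the raw CPU-only dispatch key set are assumptions for illustration.]

```cpp
// Illustrative sketch: fetch a kernel handle that is invalidated only when
// the underlying kernel is deregistered, then call it boxed.
#include <ATen/ATen.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <vector>

void call_add_via_safe_kernel() {
  auto& dispatcher = c10::Dispatcher::singleton();
  c10::OperatorHandle op = dispatcher.findSchemaOrThrow("aten::add", "Tensor");

  // New in this patch: backed by the original KernelFunction in kernels_,
  // so the token stays valid until that kernel is actually deregistered.
  c10::SafeKernelFunction safe_kernel =
      op.getComputedKernelForDispatchKey(c10::DispatchKey::CPU);

  // Boxed calling convention: push all arguments (including defaults).
  std::vector<c10::IValue> stack;
  stack.emplace_back(at::ones({2, 2}));
  stack.emplace_back(at::ones({2, 2}));
  stack.emplace_back(at::Scalar(1));  // alpha

  // callBoxed checks the token first and throws, with the stored debug
  // string, if the kernel has been deregistered in the meantime.
  safe_kernel.callBoxed(op, c10::DispatchKeySet(c10::DispatchKey::CPU), &stack);

  at::Tensor result = stack.back().toTensor();
  (void)result;
}
```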
bool hasComputedKernelForDispatchKey(DispatchKey k) const; + // Returns a KernelFunction corresponding to the kernel in dispatchTable + SafeKernelFunction getComputedKernelForDispatchKey(DispatchKey k) const; // Returns all the operator tags added at the time of registration const std::vector& getTags() const; void setReportErrorCallback_(std::unique_ptr callback); diff --git a/aten/src/ATen/core/dynamic_type.h b/aten/src/ATen/core/dynamic_type.h index b33e7ce0c5495..2ba841e44e202 100644 --- a/aten/src/ATen/core/dynamic_type.h +++ b/aten/src/ATen/core/dynamic_type.h @@ -64,6 +64,7 @@ constexpr DynamicTypeBits kDynamicClassTypeBit = DYNAMIC_TYPE_BIT(10); _(ScalarType, kDynamicIntTypeBit, 1) \ _(Layout, kDynamicIntTypeBit, 1) \ _(SymInt, kDynamicIntTypeBit, 1) \ + _(SymBool, kDynamicIntTypeBit, 1) \ _(MemoryFormat, kDynamicIntTypeBit, 1) #define FORWARD_DECL_TYPE(NAME, _, __) struct NAME ## Type; diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index c6087f0a68ecf..72589436606ec 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -97,6 +97,8 @@ c10::TypePtr IValue::TagType::get(const IValue& v) { return ComplexType::get(); case Tag::Int: return IntType::get(); + case Tag::UInt: + return IntType::get(); case Tag::SymInt: return c10::SymIntType::get(); case Tag::SymFloat: @@ -320,6 +322,8 @@ IValue IValue::equals(const IValue& rhs) const { return rhs.isComplexDouble() && lhs.toComplexDouble() == rhs.toComplexDouble(); case Tag::Int: return rhs.isInt() && lhs.toInt() == rhs.toInt(); + case Tag::UInt: + return rhs.isUnsigned() && lhs.toUInt() == rhs.toUInt(); case Tag::SymInt: return rhs.isSymInt() && lhs.toSymInt() == rhs.toSymInt(); case Tag::SymFloat: @@ -379,6 +383,8 @@ size_t IValue::hash(const IValue& v) { case Tag::Int: return c10::get_hash(v.payload.u.as_int); // NB: these are technically strict aliasing violations + case Tag::UInt: + return c10::get_hash(v.payload.u.as_int); case Tag::SymInt: return c10::get_hash(v.payload.u.as_int); case Tag::SymFloat: @@ -806,6 +812,8 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { return printComplex(out, v); } case IValue::Tag::Int: return out << v.toInt(); + case IValue::Tag::UInt: + return out << v.toUInt(); case IValue::Tag::SymInt: return out << v.toSymInt(); case IValue::Tag::SymFloat: diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 175860dc99a7c..ab2039e058201 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -160,6 +161,7 @@ struct Capsule { _(Double) \ _(ComplexDouble) \ _(Int) \ + _(UInt) \ _(SymInt) \ _(SymFloat) \ _(SymBool) \ @@ -653,6 +655,29 @@ struct TORCH_API IValue final { } } + // Unsigned + IValue(uint64_t u) : tag( u <= std::numeric_limits::max() ? 
Tag::Int : Tag::UInt) { + payload.u.as_uint = u; + } + + + // See Note [Meaning of HAS_u] + // IValue type model closely follows that of c10::Scalar + // Where all integers are upcast to 64-bit representation, and `as_int` is used as default + // representation unless value could not be represented as signed int + bool isUnsigned() const { + return Tag::UInt == tag || (Tag::Int == tag && payload.u.as_int >= 0); + } + + uint64_t toUInt() const { + if (isUnsigned()) { + return payload.u.as_uint; + } else { + TORCH_INTERNAL_ASSERT(0, "expected unsigned int"); + } + } + + // Bool IValue(bool b) : tag(Tag::Bool) { #if defined(__clang__) && defined(__x86_64__) @@ -893,8 +918,14 @@ struct TORCH_API IValue final { } else { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( s.isIntegral(false), "Unknown type in Scalar"); - tag = Tag::Int; - payload.u.as_int = s.toLong(); + if (s.isUnsigned()) { + const auto val = s.toUInt64(); + payload.u.as_uint = val; + tag = val <= std::numeric_limits::max() ? Tag::Int : Tag::UInt; + } else { + payload.u.as_int = s.toLong(); + tag = Tag::Int; + } } } @@ -918,6 +949,8 @@ struct TORCH_API IValue final { return toSymFloat(); else if (isSymBool()) return toSymBool(); + else if (isUnsigned()) + return toUInt(); TORCH_CHECK(false, "IValue is not a Scalar"); } @@ -1247,6 +1280,8 @@ struct TORCH_API IValue final { return true; case Tag::Int: return false; + case Tag::UInt: + return false; case Tag::SymInt: return true; case Tag::SymFloat: @@ -1343,6 +1378,8 @@ struct TORCH_API IValue final { union TriviallyCopyablePayload { TriviallyCopyablePayload() : as_int(0) {} int64_t as_int; + // See Note [Meaning of HAS_u] + uint64_t as_uint; double as_double; bool as_bool; // Invariant: never nullptr; null state is represented as diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index cf403365b2df2..0d319ea593840 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -832,7 +832,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); } } -#if defined(USE_ROCM) && !defined(_MSC_VER) +#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); } @@ -996,9 +996,6 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { - #ifdef USE_ROCM - TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported for ROCm"); - #endif // TODO: Support tuning for Half inputs and FP32 output bgemm_internal(CUDABLAS_BGEMM_ARGS(at::Half)); } @@ -1006,9 +1003,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float) template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { - #ifdef USE_ROCM - TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is not supported for ROCm"); - #else + #ifndef USE_ROCM cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major < 8) @@ -1273,7 +1268,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); #endif } -#if defined(USE_ROCM) && !defined(_MSC_VER) +#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); } @@ -1289,7 +1284,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) if 
(at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); } -#if defined(USE_ROCM) && !defined(_MSC_VER) +#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100 gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); @@ -1341,7 +1336,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); } -#if defined(USE_ROCM) && !defined(_MSC_VER) +#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); } @@ -1357,7 +1352,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); } -#if defined(USE_ROCM) && !defined(_MSC_VER) +#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); } @@ -1513,9 +1508,6 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { template <> void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { - #ifdef USE_ROCM - TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); - #endif // TODO: Support Tuning for fp16-fp32 gemm gemm_internal(CUDABLAS_GEMM_ARGS(at::Half)); } @@ -1523,9 +1515,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) template <> void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { - #ifdef USE_ROCM - TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm"); - #else + #ifndef USE_ROCM cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major < 8) @@ -1847,8 +1837,12 @@ int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fa switch (scaling_type) { case ScalingType::BlockWise1x32: TORCH_CHECK(scale_dtype == kFloat8_e8m0fnu); -#if CUDA_VERSION >= 12080 +#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000) +#ifdef USE_ROCM + return HIPBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0; +#else return CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0; +#endif // USE_ROCM #else TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales of 1x32 blocks is only supported for CUDA 12.8 and above"); #endif // if CUDA_VERSION >= 12080 @@ -1943,15 +1937,33 @@ void scaled_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb)); cublasLtMatmulDescAttributes_t matmulDescA = CUBLASLT_MATMUL_DESC_A_SCALE_POINTER; cublasLtMatmulDescAttributes_t matmulDescB = CUBLASLT_MATMUL_DESC_B_SCALE_POINTER; +#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) // hipblaslt supported row-wise before cublas, and did so their own way (via // the SCALE_POINTERSs), but then migrated to match how cublas does it (via // the SCALE_MODEs). Here we check for this early custom mode. 
-#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) - if (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise) { + bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise); + if (use_rowwise) { matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT; matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT; } -#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) + else if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) { + #if ROCM_VERSION >= 70000 + if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) { + // TODO: add constraints based on hipblaslt internals + TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0), + "Matrix dimensions must be multiples of 32 for MX format. " + "Got m=", m, ", n=", n, ", k=", k); + } + #endif + } +#elif (CUDA_VERSION < 12090) && !defined(USE_ROCM) + // hipblaslt supported row-wise before cublas, and did so their own way (via + // the SCALE_POINTERSs), but then migrated to match how cublas does it (via + // the SCALE_MODEs). Here we check for this early custom mode. + bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise); + // rowwise isn't supported using older cublaslt or older hipblaslt + TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt"); +#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) computeDesc.setAttribute(matmulDescA, mat1_scale_ptr); computeDesc.setAttribute(matmulDescB, mat2_scale_ptr); if (result_scale_ptr != nullptr) { @@ -1990,15 +2002,16 @@ void scaled_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, CUBLASLT_EPILOGUE_BIAS); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype)); } - - // The SCALE_MODE attrs only exist in cuBLAS 12.8+ or in recent hipblaslt, - // but we must invoke get_scale_mode anyways to trigger the version checks. - [[maybe_unused]] int a_scale_mode = get_scale_mode(mat1_scaling_type, mat1_scale_dtype, use_fast_accum); - [[maybe_unused]] int b_scale_mode = get_scale_mode(mat2_scaling_type, mat2_scale_dtype, use_fast_accum); -#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && defined(HIPBLASLT_OUTER_VEC)) - computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, a_scale_mode); - computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, b_scale_mode); -#endif + // For other data types, use the get_scale_mode function based on scaling type + // The SCALE_MODE attrs only exist in cuBLAS 12.8+/ROCm 7.0 or in recent hipblaslt, + // but we must invoke get_scale_mode anyways to trigger the version checks. + // Note that AMD/ROCm follows OCP Spec 1.0, which is different from NVIDIA's implementation. See get_scale_mode() for details. 
+ [[maybe_unused]] int a_scale_mode = get_scale_mode(mat1_scaling_type, mat1_scale_dtype, use_fast_accum); + [[maybe_unused]] int b_scale_mode = get_scale_mode(mat2_scaling_type, mat2_scale_dtype, use_fast_accum); +#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000 && defined(HIPBLASLT_OUTER_VEC)) + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, a_scale_mode); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, b_scale_mode); +#endif // if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000 && defined(HIPBLASLT_OUTER_VEC)) CuBlasLtMatmulPreference preference; auto ltworkspace = CublasLtWorkspace(); @@ -2564,8 +2577,6 @@ void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex)) { reinterpret_cast(result))); } -// HIP on Windows does not support -#if !(defined(USE_ROCM) && defined(_MSC_VER)) template <> void getrsBatched(CUDABLAS_GETRS_ARGTYPES(float)) { TORCH_CUDABLAS_CHECK(cublasSgetrsBatched( @@ -2764,6 +2775,5 @@ void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::comple devInfoArray, batchSize)); } -#endif // !(defined(USE_ROCM) && defined(_MSC_VER)) } // namespace at::cuda::blas diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index 5021917fe0950..b235840418e25 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -343,9 +343,6 @@ void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex)); int m, int n, int nrhs, Dtype** dA_array, int ldda, \ Dtype** dC_array, int lddc, int* info, int *devInfoArray, int batchSize -// HIP on Windows does not support getrs, geqrf, getrf, gels -#if !(defined(USE_ROCM) && defined(_MSC_VER)) - template void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype),"at::cuda::blas::getrsBatched: not implemented"); @@ -400,28 +397,4 @@ TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_A template<> TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::complex)); -#else // !(defined(USE_ROCM) && defined(_MSC_VER)) - -template -void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) { - TORCH_CHECK(false, "at::cuda::blas::getrsBatched: not supported for HIP on Windows"); -} - -template -void geqrfBatched(CUDABLAS_GEQRF_BATCHED_ARGTYPES(Dtype)) { - TORCH_CHECK(false, "at::cuda::blas::geqrfBatched: not supported for HIP on Windows"); -} - -template -void getrfBatched(CUDABLAS_GETRF_ARGTYPES(Dtype)) { - TORCH_CHECK(false, "at::cuda::blas::getrfBatched: not supported for HIP on Windows"); -} - -template -void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype)) { - TORCH_CHECK(false, "at::cuda::blas::gelsBatched: not supported for HIP on Windows"); -} - -#endif // !(defined(USE_ROCM) && defined(_MSC_VER)) - } // namespace at::cuda::blas diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp index 7fba7c4c7424c..b8cd84c56daef 100644 --- a/aten/src/ATen/cuda/CUDAGraph.cpp +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include @@ -253,6 +252,13 @@ cudaGraph_t CUDAGraph::raw_cuda_graph() { return graph_; } +cudaGraphExec_t CUDAGraph::raw_cuda_graph_exec() { + TORCH_CHECK( + has_graph_exec_, + "You cannot access the raw cudaGraphExec_t instance until instantiate() has been called"); + return graph_exec_; +} + void CUDAGraph::reset() { // I'd prefer these checks throw exceptions, not print warnings, // but the destructor calls reset(), and at least one CI build diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h index 
c8cae16b624fe..c18ad66b20809 100644 --- a/aten/src/ATen/cuda/CUDAGraph.h +++ b/aten/src/ATen/cuda/CUDAGraph.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -36,6 +37,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph { void enable_debug_mode(); void debug_dump(const std::string& debug_path); cudaGraph_t raw_cuda_graph(); + cudaGraphExec_t raw_cuda_graph_exec(); protected: cudaGraph_t graph_ = nullptr; diff --git a/aten/src/ATen/cuda/CachingHostAllocator.cpp b/aten/src/ATen/cuda/CachingHostAllocator.cpp index 39fd0e16fac51..34aa15d0c06cf 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.cpp +++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp @@ -162,7 +162,7 @@ struct CUDACachingHostAllocatorImpl } bool pinned_use_background_threads() override { - return c10::CachingAllocator::AcceleratorAllocatorConfig:: + return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: pinned_use_background_threads(); } diff --git a/aten/src/ATen/cuda/PeerToPeerAccess.cpp b/aten/src/ATen/cuda/PeerToPeerAccess.cpp index 91b487cd9c83e..66a75db6ea067 100644 --- a/aten/src/ATen/cuda/PeerToPeerAccess.cpp +++ b/aten/src/ATen/cuda/PeerToPeerAccess.cpp @@ -4,6 +4,9 @@ #include #include +#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) +#include +#endif #include #include @@ -12,6 +15,7 @@ namespace at::cuda { static std::vector p2pAccessEnabled_; +static std::vector fabricAccessEnabled_; static int64_t num_devices_ = -1; namespace detail { @@ -29,20 +33,23 @@ void init_p2p_access_cache(int64_t num_devices) { for (const auto i : c10::irange(num_devices)) { p2pAccessEnabled_[i * num_devices + i] = 1; } + fabricAccessEnabled_.clear(); + fabricAccessEnabled_.resize(num_devices, -1); } -} // namespace detail +} // namespace detail bool get_p2p_access(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) { at::globalContext().lazyInitDevice(c10::DeviceType::CUDA); - TORCH_CHECK(dev >= 0 || dev < num_devices_, - dev, " is not a device"); - TORCH_CHECK(dev_to_access >= 0 || dev_to_access < num_devices_, - dev_to_access, " is not a device"); + TORCH_CHECK(dev >= 0 || dev < num_devices_, dev, " is not a device"); + TORCH_CHECK( + dev_to_access >= 0 || dev_to_access < num_devices_, + dev_to_access, + " is not a device"); TORCH_INTERNAL_ASSERT(num_devices_ >= 0, "p2p access cache not initialized"); - auto &cache = p2pAccessEnabled_[dev * num_devices_ + dev_to_access]; + auto& cache = p2pAccessEnabled_[dev * num_devices_ + dev_to_access]; if (cache != -1) { return cache; @@ -58,4 +65,118 @@ bool get_p2p_access(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) { return cache; } -} // namespace at::cuda::detail +namespace { +#if !defined USE_ROCM && defined CUDA_VERSION && CUDA_VERSION >= 12040 && defined PYTORCH_C10_DRIVER_API_SUPPORTED + +nvmlDevice_t get_nvml_device(c10::DeviceIndex dev) { + static bool nvml_init [[maybe_unused]] = []() { + TORCH_INTERNAL_ASSERT(NVML_SUCCESS == DriverAPI::get()->nvmlInit_v2_()); + return true; + }(); + + auto prop = at::cuda::getDeviceProperties(dev); + char pci_id // NOLINT(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) + [NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + snprintf( + pci_id, + sizeof(pci_id), + NVML_DEVICE_PCI_BUS_ID_FMT, + prop->pciDomainID, + prop->pciBusID, + prop->pciDeviceID); + + nvmlDevice_t nvml_device = nullptr; + TORCH_INTERNAL_ASSERT( + NVML_SUCCESS == + DriverAPI::get()->nvmlDeviceGetHandleByPciBusId_v2_( + pci_id, &nvml_device)); + return nvml_device; +} + +bool isFabricSupported() { + // 1. 
try allocating memory + CUmemGenericAllocationHandle handle = 0; + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + + size_t granularity{}; + const auto driver_api = c10::cuda::DriverAPI::get(); + C10_CUDA_DRIVER_CHECK(driver_api->cuMemGetAllocationGranularity_( + &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); + + auto status = driver_api->cuMemCreate_(&handle, granularity, &prop, 0); + if (status != CUDA_SUCCESS) { + LOG(INFO) + << "status " << status + << " Could not allocate memory with FABRIC handle, falling back to fd handle exchange\n"; + return false; + } + // 2. check export + CUmemFabricHandle sharedHandle; + status = driver_api->cuMemExportToShareableHandle_( + &sharedHandle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0); + if (status != CUDA_SUCCESS) { + LOG(INFO) + << "status " << status + << " Could not export FABRIC handle, falling back to fd handle exchange\n"; + driver_api->cuMemRelease_(handle); + return false; + } + // 3. check import + CUmemGenericAllocationHandle import_handle = 0; + status = driver_api->cuMemImportFromShareableHandle_( + &import_handle, &sharedHandle, CU_MEM_HANDLE_TYPE_FABRIC); + if (status != CUDA_SUCCESS) { + LOG(INFO) + << "status " << status + << " Could not import FABRIC handle, falling back to fd handle exchange\n"; + driver_api->cuMemRelease_(handle); + return false; + } + driver_api->cuMemRelease_(import_handle); + driver_api->cuMemRelease_(handle); + LOG(INFO) << "using fabric to exchange memory handles\n"; + return true; +} +#endif +} // namespace + +bool get_fabric_access(c10::DeviceIndex dev) { +#if !defined USE_ROCM && defined CUDA_VERSION && CUDA_VERSION >= 12040 && defined PYTORCH_C10_DRIVER_API_SUPPORTED + at::globalContext().lazyInitDevice(c10::DeviceType::CUDA); + + TORCH_CHECK(dev >= 0 || dev < num_devices_, dev, " is not a device"); + auto& cache = fabricAccessEnabled_[dev]; + if (cache != -1) { + return cache; + } + auto nvml_device = get_nvml_device(dev); + if (nvml_device != nullptr) { + nvmlGpuFabricInfoV_t fabricInfo; + fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED; + fabricInfo.version = nvmlGpuFabricInfo_v2; + if (DriverAPI::get()->nvmlDeviceGetGpuFabricInfoV_ == nullptr) { + return false; + } + TORCH_CHECK( + NVML_SUCCESS == + DriverAPI::get()->nvmlDeviceGetGpuFabricInfoV_( + nvml_device, &fabricInfo)); + auto state = fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED; + if (state) { + // now perform the full cycle of allocating - exporting - importing memory + state = isFabricSupported(); + } + cache = state ? 
1 : 0; + return cache; + } else { + return false; + } +#else + return false; +#endif +} + +} // namespace at::cuda diff --git a/aten/src/ATen/cuda/PeerToPeerAccess.h b/aten/src/ATen/cuda/PeerToPeerAccess.h index 5b63a855f3f46..30d21af83ed88 100644 --- a/aten/src/ATen/cuda/PeerToPeerAccess.h +++ b/aten/src/ATen/cuda/PeerToPeerAccess.h @@ -8,5 +8,6 @@ void init_p2p_access_cache(int64_t num_devices); } TORCH_CUDA_CPP_API bool get_p2p_access(c10::DeviceIndex source_dev, c10::DeviceIndex dest_dev); +TORCH_CUDA_CPP_API bool get_fabric_access(c10::DeviceIndex device); } // namespace at::cuda diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh index aad19c6771ed7..b809512692093 100644 --- a/aten/src/ATen/cuda/cub_definitions.cuh +++ b/aten/src/ATen/cuda/cub_definitions.cuh @@ -54,7 +54,7 @@ // There were many bc-breaking changes in major version release of CCCL v3.0.0 // Please see https://nvidia.github.io/cccl/cccl/3.0_migration_guide.html -#if CUB_VERSION >= 300000 +#if CUB_VERSION >= 200800 #define CUB_V3_PLUS() true #else #define CUB_V3_PLUS() false diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 247fdb2537cb4..72826b5847925 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -19,10 +19,6 @@ #include #include -#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) -#include -#endif - #if AT_CUDNN_ENABLED() #include #endif @@ -93,29 +89,6 @@ void CUDAHooks::init() const { // have a chance to enable vitals. at::vitals::VitalsAPI.setVital("CUDA", "used", "true", /* force = */ true); - // Sets the CUDA_MODULE_LOADING environment variable - // if it's not set by the user. - // CUDA_MODULE_LOADING="LAZY" is default for all drivers released for CUDA 12.2+. - // Check the driver version and only set the env variable if needed. - bool set_lazy_module_loading = true; - #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) - auto driver_api = c10::cuda::DriverAPI::get(); - // Initialize NVML - if (driver_api->nvmlInit_v2_() == NVML_SUCCESS) { - // Get the driver version - int version = -1; - auto res = driver_api->nvmlSystemGetCudaDriverVersion_v2_(&version); - if (res == NVML_SUCCESS) { - // Check if driver is sufficiently new - if (version >= 12020) { - set_lazy_module_loading = false; - } - } - } - #endif - if (set_lazy_module_loading) { - c10::utils::set_env("CUDA_MODULE_LOADING", "LAZY", false); - } const auto num_devices = c10::cuda::device_count_ensure_non_zero(); c10::cuda::CUDACachingAllocator::init(num_devices); at::cuda::detail::init_p2p_access_cache(num_devices); @@ -207,6 +180,27 @@ bool CUDAHooks::hasCuBLASLt() const { #endif } + +bool CUDAHooks::hasCKSDPA() const { +#if !defined(USE_ROCM) + return false; +#elif defined(USE_ROCM) && defined(USE_ROCM_CK_SDPA) + return true; +#else + return false; +#endif +} + +bool CUDAHooks::hasCKGEMM() const { +#if !defined(USE_ROCM) + return false; +#elif defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) + return true; +#else + return false; +#endif +} + bool CUDAHooks::hasROCM() const { // Currently, this is same as `compiledWithMIOpen`. 
// But in future if there are ROCm builds without MIOpen, diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index b0dac7a71e809..2780369a37b71 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -31,6 +31,8 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool hasCuSOLVER() const override; bool hasCuBLASLt() const override; bool hasROCM() const override; + bool hasCKSDPA() const override; + bool hasCKGEMM() const override; const at::cuda::NVRTC& nvrtc() const override; DeviceIndex current_device() const override; bool isBuilt() const override {return true;} diff --git a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh index a65db3f2df12a..487e798bd80f6 100644 --- a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh +++ b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh @@ -49,12 +49,12 @@ struct OffsetCalculator { #if defined(USE_ROCM) if ((dims > 0) && (dims <= 2)) { auto divmod = sizes_[0].divmod(linear_idx); - #pragma unroll +#pragma unroll for (int arg = 0; arg < NARGS; arg++) offsets[arg] = divmod.mod * strides_[0][arg]; if (dims >= 2) { divmod = sizes_[1].divmod(divmod.div); - #pragma unroll +#pragma unroll for (int arg = 0; arg < NARGS; arg++) offsets[arg] += divmod.mod * strides_[1][arg]; } diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index d89875865b887..aca83386ad421 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -117,6 +117,8 @@ namespace at::cuda { _(nvrtcGetPTXSize) \ _(nvrtcGetPTX) \ _(cuModuleLoadData) \ + _(cuModuleLoad) \ + _(cuGetErrorString) \ _(cuModuleGetFunction) \ _(HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR) \ _(nvrtcGetErrorString) \ diff --git a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h index 670137e48cbc3..1f71a61c0fba1 100644 --- a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h +++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h @@ -91,7 +91,6 @@ constexpr hipDataType HipDataTypeFor() { #if ROCM_VERSION >= 70000 return HIP_R_4F_E2M1; #else - // Return HIP_R_4F_E2M1 enum value for earlier ROCm version. return static_cast(33); #endif } diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp index 9972cbd1c1514..3511e48ae061a 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.cpp +++ b/aten/src/ATen/cuda/tunable/Tunable.cpp @@ -220,19 +220,17 @@ TuningResultsValidator::TuningResultsValidator() { []() { return GetPyTorchVersion(); }, [this](auto&& k) { return ValidatePyTorchVersion(std::forward(k)); }); #ifdef USE_ROCM - // rocm + // hip { -#ifdef _WIN32 - std::string rocm_version = HIP_VERSION_BUILD_NAME; -#else - std::string rocm_version = ROCM_BUILD_INFO; -#endif + // HIP version is more accurate than ROCm version. User's environment could be a stock + // ROCm install but with a mix of newer components, making ROCm version meaningless. + std::string hip_version = c10::str(TORCH_HIP_VERSION); RegisterValidator( - "ROCM_VERSION", - [rocm_version]() { return rocm_version; }, - [rocm_version](auto&& k) { - TUNABLE_LOG1("ROCM_VERSION validation: expect ", k, " to match ", rocm_version); - return rocm_version == k ? OK : FAIL; + "HIP_VERSION", + [hip_version]() { return hip_version; }, + [hip_version](auto&& k) { + TUNABLE_LOG1("HIP_VERSION validation: expect ", k, " to match ", hip_version); + return hip_version == k ? 
OK : FAIL; }); } // gfx arch diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 6c2492b12e6b9..85f0286542e75 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -38,6 +38,7 @@ inline int dataSize(cudnnDataType_t dataType) } } +// NOTE [ cudnn fixSizeOneDimStride ] // The stride for a size-1 dimensions is not uniquely determined; in // fact, it can be anything you want, because the fact that the // tensor is size 1 at this dimension means that you will never actually diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index f99e03d156c9b..00573e3cf701b 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -118,6 +118,14 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { return false; } + virtual bool hasCKSDPA() const { + return false; + } + + virtual bool hasCKGEMM() const { + return false; + } + virtual const at::cuda::NVRTC& nvrtc() const { TORCH_CHECK(false, "NVRTC requires CUDA. ", CUDA_HELP); } diff --git a/aten/src/ATen/detail/MTIAHooksInterface.cpp b/aten/src/ATen/detail/MTIAHooksInterface.cpp index b6e260e59ec41..d2e331abb0c04 100644 --- a/aten/src/ATen/detail/MTIAHooksInterface.cpp +++ b/aten/src/ATen/detail/MTIAHooksInterface.cpp @@ -21,6 +21,10 @@ bool isMTIAHooksBuilt() { } // namespace detail +bool MTIAHooksInterface::isAvailable() const { + return detail::isMTIAHooksBuilt() && detail::getMTIAHooks().deviceCount() > 0; +} + C10_DEFINE_REGISTRY(MTIAHooksRegistry, MTIAHooksInterface, MTIAHooksArgs) } // namespace at diff --git a/aten/src/ATen/detail/MTIAHooksInterface.h b/aten/src/ATen/detail/MTIAHooksInterface.h index fb8ed6fb23226..b415862f29e7c 100644 --- a/aten/src/ATen/detail/MTIAHooksInterface.h +++ b/aten/src/ATen/detail/MTIAHooksInterface.h @@ -149,6 +149,8 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { FAIL_MTIAHOOKS_FUNC(__func__); return; } + + virtual bool isAvailable() const override; }; struct TORCH_API MTIAHooksArgs {}; diff --git a/aten/src/ATen/functorch/BatchRulesModules.cpp b/aten/src/ATen/functorch/BatchRulesModules.cpp index de69e5c1e23a4..6e63708a90f4a 100644 --- a/aten/src/ATen/functorch/BatchRulesModules.cpp +++ b/aten/src/ATen/functorch/BatchRulesModules.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -44,8 +45,13 @@ static std::tuple> embedding_batch_rule( const auto weight_ = reshape_dim_into(*weight_bdim, 0, weight); auto indices_ = moveBatchDimToFront(indices, indices_bdim); - const auto range = getStepTensor(indices, batch_size, num_embeddings); - indices_ = indices_ + range; + { + // getStepTensor returns a regular Tensor. If indices_ is a DTensor + // we want to allow this mixed DTensor-Tensor operation. 
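Sketch (not part of this patch): backend-selection code can consult the new capability hooks declared above. The helper names are hypothetical; both hooks return false on non-ROCm builds and on ROCm builds compiled without USE_ROCM_CK_GEMM / USE_ROCM_CK_SDPA.

#include <ATen/detail/CUDAHooksInterface.h>

inline bool ck_gemm_available() {
  return at::detail::getCUDAHooks().hasCKGEMM();
}

inline bool ck_sdpa_available() {
  return at::detail::getCUDAHooks().hasCKSDPA();
}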
+ at::DTensorAllowImplicitReplication guard; + const auto range = getStepTensor(indices, batch_size, num_embeddings); + indices_ = indices_ + range; + } auto result = at::embedding_symint(weight_, indices_, std::move(padding_idx), scale_grad_by_freq, sparse); return std::make_tuple(std::move(result), 0); } diff --git a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h index 39ab441478e8f..f4316def4fb42 100644 --- a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h @@ -1,7 +1,6 @@ #pragma once -#include -#include +#include // Use of c10::hip namespace here makes hipification easier, because // I don't have to also fix namespaces. Sorry! @@ -10,22 +9,227 @@ namespace c10::hip { // Takes a valid HIPAllocator (of any sort) and turns it into // an allocator pretending to be a CUDA allocator. See // Note [Masquerading as CUDA] -class HIPAllocatorMasqueradingAsCUDA final : public Allocator { - Allocator* allocator_; +class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllocator { + HIPCachingAllocator::HIPAllocator* allocator_; public: - explicit HIPAllocatorMasqueradingAsCUDA(Allocator* allocator) + explicit HIPAllocatorMasqueradingAsCUDA(HIPCachingAllocator::HIPAllocator* allocator) : allocator_(allocator) {} + + virtual ~HIPAllocatorMasqueradingAsCUDA() = default; + + // From c10::Allocator + DataPtr allocate(size_t size) override { DataPtr r = allocator_->allocate(size); r.unsafe_set_device(Device(c10::DeviceType::CUDA, r.device().index())); return r; } + + bool is_simple_data_ptr(const DataPtr& data_ptr) const override { + return allocator_->is_simple_data_ptr(data_ptr); + } + DeleterFnPtr raw_deleter() const override { return allocator_->raw_deleter(); } + void copy_data(void* dest, const void* src, std::size_t count) const final { allocator_->copy_data(dest, src, count); } + + // From DeviceAllocator + + bool initialized() override { + return allocator_->initialized(); + } + + void emptyCache(MempoolId_t mempool_id = {0, 0}) override { + allocator_->emptyCache(mempool_id); + } + + void recordStream(const DataPtr& ptr, c10::Stream stream) override { + HIPStream hip_stream = HIPStream(stream); + recordStream(ptr, hip_stream); + } + + CachingDeviceAllocator::DeviceStats getDeviceStats(c10::DeviceIndex device) override { + return allocator_->getDeviceStats(device); + } + + void resetAccumulatedStats(c10::DeviceIndex device) override { + allocator_->resetAccumulatedStats(device); + } + + void resetPeakStats(c10::DeviceIndex device) override { + allocator_->resetPeakStats(device); + } + + // From CUDAAllocator + + void* raw_alloc(size_t nbytes) override { + return allocator_->raw_alloc(nbytes); + } + + void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) override { + return allocator_->raw_alloc_with_stream(nbytes, stream); + } + + void raw_delete(void* ptr) override { + allocator_->raw_delete(ptr); + } + + void init(int device_count) override { + allocator_->init(device_count); + } + + double getMemoryFraction(c10::DeviceIndex device) override { + return allocator_->getMemoryFraction(device); + } + + void setMemoryFraction(double fraction, c10::DeviceIndex device) override { + allocator_->setMemoryFraction(fraction, device); + } + + void enable(bool value) override { + allocator_->enable(value); + } + + bool isEnabled() const override { + return allocator_->isEnabled(); + } + + void cacheInfo(c10::DeviceIndex device, size_t* 
largestBlock) override { + allocator_->cacheInfo(device, largestBlock); + } + + void* getBaseAllocation(void* ptr, size_t* size) override { + return allocator_->getBaseAllocation(ptr, size); + } + + void recordStream(const DataPtr& ptr, HIPStream stream) override { + allocator_->recordStream(ptr, stream); + } + + HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) override { + return allocator_->snapshot(mempool_id); + } + + void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) override { + allocator_->beginAllocateToPool(device, mempool_id, filter); + } + + void endAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id) override { + allocator_->endAllocateToPool(device, mempool_id); + } + + void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) override { + allocator_->releasePool(device, mempool_id); + } + + int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) override { + return allocator_->getPoolUseCount(device, mempool_id); + } + + void createOrIncrefPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + HIPAllocator* allocator = nullptr) override { + allocator_->createOrIncrefPool(device, mempool_id, allocator); + } + + void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) override { + allocator_->setUseOnOOM(device, mempool_id); + } + + bool checkPoolLiveAllocations( + c10::DeviceIndex device, + MempoolId_t mempool_id, + const std::unordered_set& expected_live_allocations) override { + return allocator_->checkPoolLiveAllocations(device, mempool_id, expected_live_allocations); + } + + HIPCachingAllocator::ShareableHandle shareIpcHandle(void* ptr) override { + return allocator_->shareIpcHandle(ptr); + } + + std::shared_ptr getIpcDevPtr(std::string handle) override { + return allocator_->getIpcDevPtr(handle); + } + + bool isHistoryEnabled() override { + return allocator_->isHistoryEnabled(); + } + + void recordHistory( + bool enabled, + HIPCachingAllocator::CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + HIPCachingAllocator::RecordContext when, + bool clearHistory) override { + allocator_->recordHistory(enabled, context_recorder, alloc_trace_max_entries, when, clearHistory); + } + + void recordAnnotation( + const std::vector>& md) override { + allocator_->recordAnnotation(md); + } + + void pushCompileContext(std::string& md) override { + allocator_->pushCompileContext(md); + } + + void popCompileContext() override { + allocator_->popCompileContext(); + } + + void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) override { + allocator_->attachOutOfMemoryObserver(observer); + } + + void attachAllocatorTraceTracker(HIPCachingAllocator::AllocatorTraceTracker tracker) override { + allocator_->attachAllocatorTraceTracker(tracker); + } + + void enablePeerAccess(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) override { + allocator_->enablePeerAccess(dev, dev_to_access); + } + + hipError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + hipStream_t stream, + bool p2p_enabled) override { + return allocator_->memcpyAsync(dst, dstDevice, src, srcDevice, count, stream, p2p_enabled); + } + + std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) override { + return allocator_->getCheckpointState(device, id); + } + + HIPCachingAllocator::CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) override { + 
auto cpd = allocator_->setCheckpointPoolState(device, pps); + for (auto& ptr : cpd.dataptrs_allocd) { + ptr.unsafe_set_device(Device(c10::DeviceType::CUDA, ptr.device().index())); + } + return cpd; + } + + std::string name() override { + return allocator_->name(); + } + }; } // namespace c10::hip diff --git a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp index 46f7d247293a1..53e7980b3d3f9 100644 --- a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp +++ b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp @@ -1,10 +1,11 @@ -#include +#include +#include #include namespace c10 { namespace hip { namespace HIPCachingAllocatorMasqueradingAsCUDA { -Allocator* get() { +HIPCachingAllocator::HIPAllocator* get() { static HIPAllocatorMasqueradingAsCUDA allocator(HIPCachingAllocator::get()); return &allocator; } diff --git a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h index 3aaa9d06c5e91..1d3606b456fca 100644 --- a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h @@ -10,9 +10,185 @@ class DataPtr; namespace hip { namespace HIPCachingAllocatorMasqueradingAsCUDA { -C10_HIP_API Allocator* get(); +C10_HIP_API HIPCachingAllocator::HIPAllocator* get(); C10_HIP_API void recordStreamMasqueradingAsCUDA(const DataPtr& ptr, HIPStreamMasqueradingAsCUDA stream); +inline void* raw_alloc(size_t nbytes) { + return get()->raw_alloc(nbytes); +} + +inline void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) { + return get()->raw_alloc_with_stream(nbytes, stream); +} + +inline void raw_delete(void* ptr) { + return get()->raw_delete(ptr); +} + +inline void init(int device_count) { + return get()->init(device_count); +} + +inline double getMemoryFraction(c10::DeviceIndex device) { + return get()->getMemoryFraction(device); +} + +inline void setMemoryFraction(double fraction, c10::DeviceIndex device) { + return get()->setMemoryFraction(fraction, device); +} + +inline void emptyCache(MempoolId_t mempool_id = {0, 0}) { + return get()->emptyCache(mempool_id); +} + +inline void enable(bool value) { + return get()->enable(value); +} + +inline bool isEnabled() { + return get()->isEnabled(); +} + +inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) { + return get()->cacheInfo(device, largestBlock); +} + +inline void* getBaseAllocation(void* ptr, size_t* size) { + return get()->getBaseAllocation(ptr, size); +} + +inline c10::CachingDeviceAllocator::DeviceStats getDeviceStats( + c10::DeviceIndex device) { + return get()->getDeviceStats(device); +} + +inline void resetAccumulatedStats(c10::DeviceIndex device) { + return get()->resetAccumulatedStats(device); +} + +inline void resetPeakStats(c10::DeviceIndex device) { + return get()->resetPeakStats(device); +} + +inline HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) { + return get()->snapshot(mempool_id); +} + +inline std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) { + return get()->getCheckpointState(device, id); +} + +inline HIPCachingAllocator::CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) { + return get()->setCheckpointPoolState(device, std::move(pps)); +} + +inline void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) { + 
get()->beginAllocateToPool(device, mempool_id, std::move(filter)); +} + +inline void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->endAllocateToPool(device, mempool_id); +} + +inline void recordHistory( + bool enabled, + HIPCachingAllocator::CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + HIPCachingAllocator::RecordContext when, + bool clearHistory) { + return get()->recordHistory( + enabled, context_recorder, alloc_trace_max_entries, when, clearHistory); +} + +inline void recordAnnotation( + const std::vector>& md) { + return get()->recordAnnotation(md); +} + +inline void pushCompileContext(std::string& md) { + return get()->pushCompileContext(md); +} + +inline void popCompileContext() { + return get()->popCompileContext(); +} + +inline bool isHistoryEnabled() { + return get()->isHistoryEnabled(); +} + +inline bool checkPoolLiveAllocations( + c10::DeviceIndex device, + MempoolId_t mempool_id, + const std::unordered_set& expected_live_allocations) { + return get()->checkPoolLiveAllocations( + device, mempool_id, expected_live_allocations); +} + +inline void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) { + return get()->attachOutOfMemoryObserver(std::move(observer)); +} + +inline void attachAllocatorTraceTracker(HIPCachingAllocator::AllocatorTraceTracker tracker) { + return get()->attachAllocatorTraceTracker(std::move(tracker)); +} + +inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) { + return get()->releasePool(device, mempool_id); +} + +inline void createOrIncrefPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + HIPCachingAllocator::HIPAllocator* allocator_ptr = nullptr) { + get()->createOrIncrefPool(device, mempool_id, allocator_ptr); +} + +inline void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->setUseOnOOM(device, mempool_id); +} + +inline int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) { + return get()->getPoolUseCount(device, mempool_id); +} + +inline std::shared_ptr getIpcDevPtr(std::string handle) { + return get()->getIpcDevPtr(std::move(handle)); +} + +inline HIPCachingAllocator::ShareableHandle shareIpcHandle(void* ptr) { + return get()->shareIpcHandle(ptr); +} + +inline std::string name() { + return get()->name(); +} + +inline hipError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + hipStream_t stream, + bool p2p_enabled) { + return get()->memcpyAsync( + dst, dstDevice, src, srcDevice, count, stream, p2p_enabled); +} + +inline void enablePeerAccess( + c10::DeviceIndex dev, + c10::DeviceIndex dev_to_access) { + return get()->enablePeerAccess(dev, dev_to_access); +} + } // namespace HIPCachingAllocatorMasqueradingAsCUDA } // namespace hip } // namespace c10 diff --git a/aten/src/ATen/miopen/Descriptors.cpp b/aten/src/ATen/miopen/Descriptors.cpp index 08c09b88f99cb..86e42ee3b66dc 100644 --- a/aten/src/ATen/miopen/Descriptors.cpp +++ b/aten/src/ATen/miopen/Descriptors.cpp @@ -19,31 +19,37 @@ inline miopenDataType_t getDataType(const at::Tensor& t) { } else { TORCH_CHECK( false, - "TensorDescriptor only supports float, half and bfloat16 tensors"); + "TensorDescriptor does not support ", scalar_type); } } } // anonymous namespace +constexpr size_t MIOPEN_DIM_MAX = 5; -void TensorDescriptor::set(const at::Tensor &t, size_t pad) { - set(getDataType(t), t.sizes(), t.strides(), pad); +void TensorDescriptor::set(const at::Tensor &t, at::MemoryFormat memory_format, size_t 
pad) { + set(getDataType(t), t.sizes(), t.strides(), pad, + memory_format == at::MemoryFormat::ChannelsLast || + memory_format == at::MemoryFormat::ChannelsLast3d); } -constexpr size_t MIOPEN_DIM_MAX = 5; +void TensorDescriptor::set(const at::Tensor &t, size_t pad) { + auto memory_format = t.suggest_memory_format(); + set(getDataType(t), t.sizes(), t.strides(), pad, + memory_format == at::MemoryFormat::ChannelsLast || + memory_format == at::MemoryFormat::ChannelsLast3d); +} void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntArrayRef t_strides, size_t pad) { + set(datatype, t_sizes, t_strides, pad, + is_channels_last_strides_2d(t_sizes, t_strides) || + is_channels_last_strides_3d(t_sizes, t_strides)); +} + +void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntArrayRef t_strides, size_t pad, bool nhwc) { size_t dim = t_sizes.size(); if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX) -#define _STR(X) #X -#define STR(X) _STR(X) - TORCH_CHECK( - false, - "MIOpen supports only up to ", - STR(MIOPEN_DIM_MAX), - " dimensions"); -#undef _STR -#undef STR + TORCH_CHECK(false, "MIOpen supports only up to ", MIOPEN_DIM_MAX, " dimensions"); int size[MIOPEN_DIM_MAX]; int stride[MIOPEN_DIM_MAX]; for (const auto i : c10::irange(dim)) { @@ -54,7 +60,7 @@ void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntAr size[i] = 1; stride[i] = 1; } - set(datatype, static_cast(std::max(dim, pad)), size, stride); + set(datatype, static_cast(std::max(dim, pad)), size, stride, nhwc); } std::string miopenTypeToString(miopenDataType_t dtype) { @@ -74,10 +80,11 @@ std::string miopenTypeToString(miopenDataType_t dtype) { std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) { out << "TensorDescriptor " << static_cast(d.desc()) << "\n"; - int nbDims = 4; + int nbDims = 0; int dimA[MIOPEN_DIM_MAX]; int strideA[MIOPEN_DIM_MAX]; miopenDataType_t dtype; + miopenGetTensorDescriptorSize(d.desc(), &nbDims); miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA); out << " type = " << miopenTypeToString(dtype) << "\n"; out << " nbDims = " << nbDims << "\n"; @@ -99,19 +106,17 @@ void TensorDescriptor::print() { std::cout << *this; } void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad) { auto dim = t.ndimension(); - if (dim > static_cast(MIOPEN_DIM_MAX) || pad > static_cast(MIOPEN_DIM_MAX)) { -#define _STR(X) #X -#define STR(X) _STR(X) - TORCH_CHECK( - false, - "MIOpen supports only up to ", - STR(MIOPEN_DIM_MAX), - " dimensions"); -#undef _STR -#undef STR - } + if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX) + TORCH_CHECK(false, "MIOpen supports only up to ", MIOPEN_DIM_MAX, " dimensions"); + // NB: It is possible for this test to be insufficient, because the + // Tensor passed in to set the filter descriptor may not be the actual + // Tensor whose data pointer is passed to cuDNN. Nevertheless, + // that is the common case, so we can catch most client errors with this test. TORCH_CHECK(t.is_contiguous(memory_format), - "MIOpen filters (a.k.a. weights) must be contiguous"); + "MIOpen filters (a.k.a. 
weights) must be contiguous in desired memory_format\n", + "Weight sizes: ", t.sizes(), "\n", + "Weight strides: ", t.strides(), "\n", + "cuDNN suggested memory_format: ", memory_format); int size[MIOPEN_DIM_MAX]; int stride[MIOPEN_DIM_MAX]; @@ -131,7 +136,9 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo } dim = std::max(dim, pad); - set(getDataType(t), (int) dim, size, stride); + set(getDataType(t), static_cast(dim), size, stride, + memory_format == at::MemoryFormat::ChannelsLast || + memory_format == at::MemoryFormat::ChannelsLast3d); } }} diff --git a/aten/src/ATen/miopen/Descriptors.h b/aten/src/ATen/miopen/Descriptors.h index 2eee837cd533d..8825575c9231b 100644 --- a/aten/src/ATen/miopen/Descriptors.h +++ b/aten/src/ATen/miopen/Descriptors.h @@ -9,6 +9,8 @@ namespace at { namespace native { +std::string miopenTypeToString(miopenDataType_t dtype); + inline int dataSize(miopenDataType_t dataType) { switch (dataType) { @@ -19,6 +21,32 @@ inline int dataSize(miopenDataType_t dataType) } } +// See NOTE [ cudnn fixSizeOneDimStride ] in aten/src/ATen/cudnn/Descriptors.h +template +static inline void fixSizeOneDimStride(int dim, const T *size, T *stride, bool nhwc) { + int64_t z = 1; + int index = 0; + std::vector permutation(dim); + + if (nhwc) { + permutation[index++] = 1; + } + for (int d = dim-1; d > 1; d--) { + permutation[index++] = d; + } + if (!nhwc) { + permutation[index++] = 1; + } + permutation[index++] = 0; + for (int d : permutation) { + if (size[d] == 1) { + stride[d] = z; + } else { + z *= size[d]; + } + } +} + template struct DescriptorDeleter { void operator()(T* x) { @@ -75,14 +103,20 @@ class TORCH_HIP_CPP_API TensorDescriptor : public Descriptor< set(t, pad); } + // See Note [CuDNN broadcast padding] void set(const at::Tensor &t, size_t pad = 0); + void set(const at::Tensor &t, at::MemoryFormat memory_format, size_t pad = 0); void set(miopenDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad = 0); void print(); private: - void set(miopenDataType_t dataType, int dim, int* size, int* stride) { - MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride)); + void set(miopenDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad, bool nhwc); + + void set(miopenDataType_t dataType, int dim, int* size, int* stride, bool nhwc) { + std::vector strides_copy(stride, stride + dim); + fixSizeOneDimStride(dim, size, strides_copy.data(), nhwc); + MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, strides_copy.data())); } }; @@ -100,8 +134,10 @@ class TORCH_HIP_CPP_API FilterDescriptor : public Descriptor< void set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad = 0); private: - void set(miopenDataType_t dataType, int dim, int* size, int* stride) { - MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride)); + void set(miopenDataType_t dataType, int dim, int* size, int* stride, bool nhwc) { + std::vector strides_copy(stride, stride + dim); + fixSizeOneDimStride(dim, size, strides_copy.data(), nhwc); + MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, strides_copy.data())); } }; @@ -166,4 +202,4 @@ union Constant } }; -}} // namespace +}} // namespace diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp index 7b04d65ebdd02..d858df0733975 100644 --- a/aten/src/ATen/mps/EmptyTensor.cpp +++ b/aten/src/ATen/mps/EmptyTensor.cpp @@ -43,7 +43,6 @@ TensorBase empty_mps( int64_t nelements 
= c10::multiply_integers(size); auto dtype = dtype_or_default(dtype_opt); TORCH_CHECK_TYPE(dtype != ScalarType::Double, MPS_ERROR_DOUBLE_NOT_SUPPORTED); - TORCH_CHECK_TYPE(dtype != ScalarType::BFloat16 || is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_14_0_PLUS), "MPS BFloat16 is only supported on MacOS 14 or newer"); auto dtype_meta = scalarTypeToTypeMeta(dtype); diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h index a70ce25108201..9b58477104978 100644 --- a/aten/src/ATen/mps/MPSDevice.h +++ b/aten/src/ATen/mps/MPSDevice.h @@ -18,11 +18,7 @@ namespace at::mps { // Helper enum to check if a MPSGraph op is supported in a given macOS version enum class MacOSVersion : uint32_t { - MACOS_VER_13_1_PLUS = 0, - MACOS_VER_13_2_PLUS, - MACOS_VER_13_3_PLUS, - MACOS_VER_14_0_PLUS, - MACOS_VER_14_4_PLUS, + MACOS_VER_14_4_PLUS = 0, MACOS_VER_15_0_PLUS, MACOS_VER_15_1_PLUS, MACOS_VER_15_2_PLUS, @@ -59,6 +55,17 @@ class TORCH_API MPSDevice { */ bool isMacOS13Plus(MacOSVersion version) const; + /** + * Returns device name + */ + std::string getName() const; + + /** + * Returns number of GPU cores. + * 1 Core = 16 ExecutionUnit x 8 ALU x 24 threads + */ + unsigned getCoreCount() const; + ~MPSDevice(); private: diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm index 55af5f83b388c..5a37490c02402 100644 --- a/aten/src/ATen/mps/MPSDevice.mm +++ b/aten/src/ATen/mps/MPSDevice.mm @@ -32,11 +32,11 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& de MPSDevice::MPSDevice() : _mtl_device(nil) { // Check that MacOS 13.0+ version of MPS framework is available - // Create the MPSGraph and check method introduced in 13.0 + // Create the MPSGraph and check method introduced in 14.0 // which is used by MPS backend. 
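Sketch (not part of this patch): reading the two MPSDevice accessors added above. getCoreCount() walks IOKit's "AGXAccelerator" registry entries, so the value is only meaningful on Apple silicon; the logging helper below is hypothetical.

#include <ATen/mps/MPSDevice.h>
#include <iostream>

void log_mps_device_info() {
  auto* device = at::mps::MPSDevice::getInstance();
  std::cout << "MPS device: " << device->getName()
            << " (" << device->getCoreCount() << " GPU cores)" << std::endl;
}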
id mpsCD = NSClassFromString(@"MPSGraph"); - if ([mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == NO) { + if ([mpsCD instancesRespondToSelector:@selector(HermiteanToRealFFTWithTensor:axes:descriptor:name:)] == NO) { return; } @@ -66,24 +66,12 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& de isOperatingSystemAtLeastVersion:{.majorVersion = major, .minorVersion = minor, .patchVersion = 0}]; } }; - static bool _macos_13_1_plus = is_os_version_at_least(13, 1); - static bool _macos_13_2_plus = is_os_version_at_least(13, 2); - static bool _macos_13_3_plus = is_os_version_at_least(13, 3); - static bool _macos_14_0_plus = is_os_version_at_least(14, 0); static bool _macos_14_4_plus = is_os_version_at_least(14, 4); static bool _macos_15_0_plus = is_os_version_at_least(15, 0); static bool _macos_15_1_plus = is_os_version_at_least(15, 1); static bool _macos_15_2_plus = is_os_version_at_least(15, 2); switch (version) { - case MacOSVersion::MACOS_VER_13_1_PLUS: - return _macos_13_1_plus; - case MacOSVersion::MACOS_VER_13_2_PLUS: - return _macos_13_2_plus; - case MacOSVersion::MACOS_VER_13_3_PLUS: - return _macos_13_3_plus; - case MacOSVersion::MACOS_VER_14_0_PLUS: - return _macos_14_0_plus; case MacOSVersion::MACOS_VER_14_4_PLUS: return _macos_14_4_plus; case MacOSVersion::MACOS_VER_15_0_PLUS: @@ -97,10 +85,36 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& de } } +std::string MPSDevice::getName() const { + @autoreleasepool { + return [[_mtl_device name] UTF8String]; + } +} + +unsigned MPSDevice::getCoreCount() const { + io_iterator_t iterator = 0; + io_registry_entry_t entry = 0; + int core_count = 0; + auto matchingDict = IOServiceMatching("AGXAccelerator"); + TORCH_INTERNAL_ASSERT(matchingDict, "Failed to create matching dict"); + const auto status = IOServiceGetMatchingServices(kIOMainPortDefault, matchingDict, &iterator); + TORCH_INTERNAL_ASSERT(status == KERN_SUCCESS); + while ((entry = IOIteratorNext(iterator)) != 0) { + auto property = IORegistryEntryCreateCFProperty(entry, CFSTR("gpu-core-count"), kCFAllocatorDefault, 0); + auto found = CFNumberGetValue(static_cast(property), kCFNumberIntType, &core_count); + CFRelease(property); + IOObjectRelease(entry); + if (found) { + break; + } + } + IOObjectRelease(iterator); + return core_count; +} + at::Allocator* GetMPSAllocator(bool useSharedAllocator) { return getIMPSAllocator(useSharedAllocator); } - bool is_available() { return MPSDevice::getInstance()->device() != nil; } diff --git a/aten/src/ATen/mps/MPSHooks.mm b/aten/src/ATen/mps/MPSHooks.mm index f6133e8877222..a2ec221c1bfea 100644 --- a/aten/src/ATen/mps/MPSHooks.mm +++ b/aten/src/ATen/mps/MPSHooks.mm @@ -34,7 +34,7 @@ case 14: switch (minor) { case 0: - return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS); + return true; case 4: return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS); default: @@ -42,19 +42,7 @@ return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS); } case 13: - switch (minor) { - case 0: - return true; - case 1: - return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_1_PLUS); - case 2: - return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS); - case 3: - return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - default: - TORCH_WARN("Can't check whether running on 13.", minor, "+ returning one for 13.3+"); - return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - } + return true; default: TORCH_WARN("Checking for unexpected MacOS ", major, 
".", minor, " returning false"); return false; diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp index 674ccf11cfb9b..49366151ae60b 100644 --- a/aten/src/ATen/native/Blas.cpp +++ b/aten/src/ATen/native/Blas.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #if !defined(__s390x__) && !defined(__powerpc__) #include #endif @@ -332,4 +333,23 @@ _scaled_mm_cpu(const Tensor& mat_a, const Tensor& mat_b, return _scaled_mm_out_cpu(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out); } +// TODO(vasiliy, future PR): figure out why we need to declare this function, when +// other functions that live in ATen/native/*.cpp without declarations +// or headers work just fine. +Tensor _grouped_mm(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype); + +Tensor _grouped_mm(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype) { + _grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype); + const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); + _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); + return out; +} + } // namespace at::native diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp index 79dbe7353e159..e06afddd05aa7 100644 --- a/aten/src/ATen/native/CPUBlas.cpp +++ b/aten/src/ATen/native/CPUBlas.cpp @@ -51,7 +51,7 @@ extern "C" void zaxpy_(int *n, void *a, const void *x, int *incx, void *y, int * // brgemm_pack_B is changed to transform and the setting of brgemm beta is changed to set_add_C #if (IDEEP_VERSION_MAJOR == 3 && IDEEP_VERSION_MINOR == 5) #define ONEDNN_UKERNEL_1 -#elif (IDEEP_VERSION_MAJOR >= 3 && IDEEP_VERSION_MINOR >= 6) +#elif ((IDEEP_VERSION_MAJOR == 3 && IDEEP_VERSION_MINOR >= 6) || (IDEEP_VERSION_MAJOR > 3)) #define ONEDNN_UKERNEL_2 #endif #if ((defined(ONEDNN_UKERNEL_1) || defined(ONEDNN_UKERNEL_2)) && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))) @@ -496,18 +496,18 @@ void gemm( // for the fallback path, first compute gemm with beta = 0, // and then add c in full precision. 
int64_t c_size = n * m; - std::vector float16_c(c_size, 0.f); - gemm_stub( + std::vector float_c(c_size, 0.f); + gemm_no_downcast_stub( at::kCPU, at::kHalf, - transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float16_c.data(), m); + transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m); for (const auto j : c10::irange(n)) { for (const auto i : c10::irange(m)) { auto offset = j * ldc + i; // beta == 0 won't propagate NaN from C if (beta == 0.f) { - c[offset] = c10::convert(float16_c[j * m + i]); + c[offset] = float_c[j * m + i]; } else { - c[offset] = beta * c[offset] + c10::convert(float16_c[j * m + i]); + c[offset] = beta * c[offset] + float_c[j * m + i]; } } } diff --git a/aten/src/ATen/native/CPUBlas.h b/aten/src/ATen/native/CPUBlas.h index 95d11903dc773..8b75f12ebaf21 100644 --- a/aten/src/ATen/native/CPUBlas.h +++ b/aten/src/ATen/native/CPUBlas.h @@ -206,6 +206,16 @@ void copy(int64_t n, const c10::complex *x, int64_t incx, c10::complex float +#define CPUBLAS_BRGEMM_BF16BF16F32 // bfloat16 * bfloat16 -> float +#define CPUBLAS_BRGEMM_F32F32F32 // float * float -> float +#define CPUBLAS_BRGEMM_U8U8I32 // unsigned char * unsigned char -> int32 +#define CPUBLAS_BRGEMM_U8I8I32 // unsigned char * signed char -> int32 +#define CPUBLAS_BRGEMM_I8I8I32 // signed char * signed char -> int32 + TORCH_API void brgemm( int64_t M, int64_t N, diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 84381efe55b0b..e160c84ced331 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -353,19 +353,21 @@ TORCH_API void _cudnn_set_conv_benchmark_empty_cache(bool enable); TORCH_API bool _cudnn_get_conv_benchmark_empty_cache(); -inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { - +inline at::MemoryFormat miopen_conv_suggest_memory_format(const at::Tensor& input, const at::Tensor& weight) { // disable NHWC for float64 input. if (!at::detail::getCUDAHooks().compiledWithMIOpen() || input.scalar_type() == at::kDouble || weight.scalar_type() == at::kDouble) { - return false; + return at::MemoryFormat::Contiguous; } // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen - // See #64427 - static std::optional PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC"); - static bool suggest_nhwc = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC; + // See https://github.com/pytorch/pytorch/issues/64427. + // non static variable is used to be able to change environment variable in runtime for testing + // enabled by default for ROCm >= 7.0.0 with miopen 3.5 + int miopen_version = detail::getCUDAHooks().compiledWithMIOpen() ? 
detail::getCUDAHooks().versionMIOpen() : 0; + bool is_miopen_3_5 = miopen_version >= 30500; // ROCm 7.0 + bool suggest_nhwc = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC").value_or(is_miopen_3_5); auto input_memory_format = input.suggest_memory_format(); auto weight_memory_format = weight.suggest_memory_format(); @@ -375,13 +377,24 @@ inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Ten (input_memory_format == at::MemoryFormat::ChannelsLast) || (weight_memory_format == at::MemoryFormat::ChannelsLast) ); + if (can_use_miopen_channels_last_2d) { + return at::MemoryFormat::ChannelsLast; + } bool can_use_miopen_channels_last_3d = suggest_nhwc && (weight_ndim == 5) && ( (input_memory_format == at::MemoryFormat::ChannelsLast3d) || (weight_memory_format == at::MemoryFormat::ChannelsLast3d) ); + if (can_use_miopen_channels_last_3d) { + return at::MemoryFormat::ChannelsLast3d; + } + + return at::MemoryFormat::Contiguous; +} - return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d; +// deprecated, but to remove would be BC-breaking +inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + return miopen_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous; } inline bool mkldnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index b926df11c21f3..ab427f396e345 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -13,6 +14,7 @@ #include #include #include +#include #include #include @@ -299,67 +301,50 @@ struct ConvParams { bool allow_tf32{}; bool is_strided() const { - bool is_strided = false; - for (const auto& s : stride) { - is_strided |= (s != 1); - } - return is_strided; + return std::any_of( + stride.cbegin(), stride.cend(), [](const T& s) { return s != 1; }); } bool is_dilated() const { - bool is_dilated = false; - for (const auto& d : dilation) { - is_dilated |= (d != 1); - } - return is_dilated; + return std::any_of( + dilation.cbegin(), dilation.cend(), [](const T& d) { return d != 1; }); } bool is_padded() const { - bool is_padded = false; - for (auto p : padding) { - is_padded |= (p != 0); - } - return is_padded; + return std::any_of( + padding.cbegin(), padding.cend(), [](const T& p) { return p != 0; }); } bool is_output_padding_neg() const { - bool is_non_neg = false; - for (const auto& p : output_padding) { - is_non_neg |= (p < 0); - } - return is_non_neg; + return std::any_of( + output_padding.cbegin(), + output_padding.cend(), + [](const T& p) { return p < 0; }); } bool is_output_padding_big() const { - bool is_big = false; + // Revisit this with std::views::zip at C++20. 
for (auto i: c10::irange(output_padding.size())) { - is_big |= (output_padding[i] >= stride[i]); + if (output_padding[i] >= stride[i]) { + return true; + } } - return is_big; + return false; } bool is_padding_neg() const { - bool is_non_neg = false; - for (const auto& p : padding) { - is_non_neg |= (p < 0); - } - return is_non_neg; + return std::any_of( + padding.cbegin(), padding.cend(), [](const T& p) { return p < 0; }); } bool is_dilation_neg() const { - bool is_non_neg = false; - for (const auto& p : dilation) { - is_non_neg |= (p < 0); - } - return is_non_neg; + return std::any_of( + dilation.cbegin(), dilation.cend(), [](const T& d) { return d < 0; }); } bool is_stride_nonpos() const { - bool is_nonpos = false; - for (const auto& s : stride) { - is_nonpos |= (s <= 0); - } - return is_nonpos; + return std::any_of( + stride.cbegin(), stride.cend(), [](const T& s) { return s <= 0; }); } void view1d_as_2d() { @@ -458,12 +443,15 @@ struct ConvParams { // Use cudnn for FP16 depthwise convolutions bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const { + if (!detail::getCUDAHooks().compiledWithCuDNN()) { + return false; + } if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) { // always use cudnn_depthwise for channels_last format return true; } // native kernel doesn't support 64-bit non-splittable case - if (cudnn_enabled && needs_64bit_indexing_no_split(input, weight)) { + if (cudnn_enabled && !(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) { static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1; if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" @@ -1418,10 +1406,8 @@ static inline at::MemoryFormat determine_backend_memory_format( case ConvBackend::Miopen: case ConvBackend::MiopenDepthwise: case ConvBackend::MiopenTranspose: - if (detail::getCUDAHooks().compiledWithMIOpen() && miopen_conv_use_channels_last(input, weight)) { - TORCH_INTERNAL_ASSERT((k == 4 || k == 5), - "Expected 4D or 5D input for miopen memory format selection in determine_backend_memory_format()"); - backend_memory_format = (k == 5) ? 
at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; + if (detail::getCUDAHooks().compiledWithMIOpen()) { + backend_memory_format = miopen_conv_suggest_memory_format(input, weight); } break; case ConvBackend::Mkldnn: diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index 64c39fcaef239..cb437fb45ce21 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -260,6 +260,7 @@ namespace at::native { check_foreach_api_restrictions(input, tensors1, tensors2); \ \ std::vector result; \ + result.reserve(input.size()); \ for (const auto i : c10::irange(input.size())) { \ result.emplace_back(input[i].OP(tensors1[i], tensors2[i], scalar)); \ } \ @@ -288,6 +289,7 @@ namespace at::native { check_foreach_api_restrictions(input, tensors1, tensors2, scalars); \ \ std::vector result; \ + result.reserve(input.size()); \ for (const auto i : c10::irange(input.size())) { \ result.emplace_back(input[i].OP(tensors1[i], tensors2[i], scalars[i])); \ } \ @@ -417,6 +419,7 @@ std::vector foreach_tensor_ternary_lerp_slow( TensorList tensors3) { check_foreach_api_restrictions(tensors1, tensors2, tensors3); std::vector result; + result.reserve(tensors1.size()); for (const auto i : c10::irange(tensors1.size())) { result.emplace_back(tensors1[i].lerp(tensors2[i], tensors3[i])); } @@ -439,6 +442,7 @@ std::vector foreach_tensor_lerp_scalarlist_kernel_slow( at::ArrayRef scalars) { check_foreach_api_restrictions(tensors1, tensors2, scalars); std::vector result; + result.reserve(tensors1.size()); for (const auto i : c10::irange(tensors1.size())) { result.emplace_back(tensors1[i].lerp(tensors2[i], scalars[i])); } @@ -469,6 +473,7 @@ std::vector foreach_tensor_norm_slow( std::optional dtype) { check_foreach_api_restrictions(tensors); std::vector result; + result.reserve(tensors.size()); for (const auto& t : tensors) { result.emplace_back(at::linalg_vector_norm(t, ord, {}, false, dtype)); } @@ -478,6 +483,7 @@ std::vector foreach_tensor_norm_slow( std::vector foreach_tensor_max_slow(TensorList tensors) { check_foreach_api_restrictions(tensors); std::vector result; + result.reserve(tensors.size()); for (const auto& t : tensors) { result.emplace_back(at::max(t)); } diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index 56b7a6f98e779..f0dce20a6eff4 100644 --- a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -22,7 +22,7 @@ namespace { // Check if tensor list has either a boolean tensor or a integer tensor inline bool has_integral_tensor(TensorList tensors, const bool includeBool) { return std::any_of( - tensors.begin(), tensors.end(), [&includeBool](const auto& t) { + tensors.begin(), tensors.end(), [includeBool](const auto& t) { return at::isIntegralType(t.scalar_type(), includeBool); }); } @@ -53,8 +53,8 @@ inline void check_foreach_api_restrictions( inline void check_foreach_api_restrictions( TensorList tensors1, TensorList tensors2) { - TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor."); - TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor."); + check_foreach_api_restrictions(tensors1); + check_foreach_api_restrictions(tensors2); TORCH_CHECK( tensors1.size() == tensors2.size(), "Tensor lists must have the same number of tensors, got ", @@ -67,21 +67,8 @@ inline void check_foreach_api_restrictions( TensorList tensors1, TensorList tensors2, TensorList tensors3) { - TORCH_CHECK(!tensors1.empty(), 
"Tensor list must have at least one tensor."); - TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor."); - TORCH_CHECK(!tensors3.empty(), "Tensor list must have at least one tensor."); - TORCH_CHECK( - tensors1.size() == tensors2.size(), - "Tensor lists must have the same number of tensors, got ", - tensors1.size(), - " and ", - tensors2.size()); - TORCH_CHECK( - tensors1.size() == tensors3.size(), - "Tensor lists must have the same number of tensors, got ", - tensors1.size(), - " and ", - tensors3.size()); + check_foreach_api_restrictions(tensors1, tensors2); + check_foreach_api_restrictions(tensors1, tensors3); } inline void check_foreach_api_restrictions( @@ -90,12 +77,7 @@ inline void check_foreach_api_restrictions( TensorList tensors3, ArrayRef scalars) { check_foreach_api_restrictions(tensors1, tensors2, tensors3); - TORCH_CHECK( - tensors1.size() == scalars.size(), - "Tensor list must have same number of elements as scalar list, got ", - tensors1.size(), - " and ", - scalars.size()); + check_foreach_api_restrictions(tensors1, scalars); } inline void check_foreach_api_restrictions( @@ -103,12 +85,7 @@ inline void check_foreach_api_restrictions( TensorList tensors2, ArrayRef scalars) { check_foreach_api_restrictions(tensors1, tensors2); - TORCH_CHECK( - tensors1.size() == scalars.size(), - "Tensor list must have same number of elements as scalar list, got ", - tensors1.size(), - " and ", - scalars.size()); + check_foreach_api_restrictions(tensors1, scalars); } // Helper function called in check_fast_path_restrictions to check whether all @@ -126,15 +103,13 @@ inline bool _check_tensors_share_device_and_dtype( tensor.is_non_overlapping_and_dense(); }; - for (const auto& tensorList : tensorLists) { - for (const auto& tensor : tensorList) { - if (!is_tensor_okay(tensor)) { - return false; - } - } - } - - return true; + return std::all_of( + tensorLists.cbegin(), + tensorLists.cend(), + [&](const TensorList& tensorList) { + return std::all_of( + tensorList.cbegin(), tensorList.cend(), is_tensor_okay); + }); } // Helper function called in check_fast_path_restrictions to check if @@ -180,11 +155,9 @@ inline bool _check_tensors_do_type_promotion_with_scalars( bool does_op_promote_integer_inputs_to_float = false) { for (const auto i : c10::irange(tensorList.size())) { // For division, integer inputs will result in float. - if (does_op_promote_integer_inputs_to_float) { - if (at::isIntegralType( - tensorList[i].scalar_type(), /*includeBool*/ true)) { - return false; - } + if (does_op_promote_integer_inputs_to_float && + at::isIntegralType(tensorList[i].scalar_type(), /*includeBool*/ true)) { + return false; } if (!scalarList.empty()) { const auto& scalar = @@ -361,36 +334,34 @@ inline FlatMap _group_tensors_by_first_tensors_device_and_dtype( } }), "Tensors of the same index must be on the same device and the same dtype except `step` tensors that can be CPU and float32/64 notwithstanding"); - if (!grouped_tensors_with_indices.count(key)) { - grouped_tensors_with_indices.insert( - {key, - TensorsAndIndicesT{ - [&]() -> nested_optional_tensorvec_t { - nested_optional_tensorvec_t nested_tensorvec; - nested_tensorvec.reserve(num_lists); - for (const auto& i : c10::irange(num_lists)) { - std::vector> tensors; - if (!nested_tensorlist[i].empty()) { - // NB: num_tensors is the max possible length for any of - // the inner lists of tensor references. Reserving the max - // trades memory for perf. This should not have significant - // impact. 
- tensors.reserve(num_tensors); - } - nested_tensorvec.emplace_back(tensors); - } - return nested_tensorvec; - }(), - [&]() -> IndicesT { - if (!with_indices) { - return {}; - } else { - IndicesT indices; - indices.reserve(num_tensors); - return indices; - } - }()}}); - } + grouped_tensors_with_indices.try_emplace( + key, + TensorsAndIndicesT{ + [&]() -> nested_optional_tensorvec_t { + nested_optional_tensorvec_t nested_tensorvec; + nested_tensorvec.reserve(num_lists); + for (const auto& i : c10::irange(num_lists)) { + std::vector> tensors; + if (!nested_tensorlist[i].empty()) { + // NB: num_tensors is the max possible length for any of + // the inner lists of tensor references. Reserving the max + // trades memory for perf. This should not have significant + // impact. + tensors.reserve(num_tensors); + } + nested_tensorvec.emplace_back(std::move(tensors)); + } + return nested_tensorvec; + }(), + [&]() -> IndicesT { + if (!with_indices) { + return {}; + } else { + IndicesT indices; + indices.reserve(num_tensors); + return indices; + } + }()}); for (const auto& list_index : c10::irange(num_lists)) { if (!nested_tensorlist[list_index].empty()) { grouped_tensors_with_indices[key].first[list_index].emplace_back( diff --git a/aten/src/ATen/native/GroupedMMUtils.h b/aten/src/ATen/native/GroupedMMUtils.h new file mode 100644 index 0000000000000..78993308cd5fa --- /dev/null +++ b/aten/src/ATen/native/GroupedMMUtils.h @@ -0,0 +1,167 @@ +#pragma once + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#include +#else +#include +#include +#include +#include +#endif + +namespace at::native { + +inline bool check_valid_strides_and_return_transposed(const Tensor& mat) { + IntArrayRef tensor_strides = mat.strides(); + IntArrayRef tensor_sizes = mat.sizes(); + int end_dim = mat.dim() - 1; + int alignment = 16 / mat.element_size(); + TORCH_CHECK(uint64_t(mat.data_ptr()) % 16 ==0, "expected data_ptr to be aligned to 16 bytes\n"); + if ((tensor_strides[end_dim - 1] == 1) && (tensor_strides[end_dim] >= std::max(1, tensor_sizes[end_dim - 1]))) { + TORCH_CHECK(tensor_strides[end_dim] % alignment == 0, "strides should be multiple of 16 bytes"); + return true; + } else if ((tensor_strides[end_dim] == 1) && (tensor_strides[end_dim - 1] >= std::max(1, tensor_sizes[end_dim]))) { + TORCH_CHECK(tensor_strides[end_dim - 1] % alignment == 0, "strides should be multiple of 16 bytes"); + return false; + } else { + TORCH_CHECK(false, "Invalid strides/sizes, got ", mat.strides(), " for strides and ", mat.sizes(), " for sizes"); + } +} + +inline at::Tensor create_grouped_gemm_output_tensor(const Tensor& mat_a, +const Tensor& mat_b, +const std::optional& offs, +c10::ScalarType out_dtype +) { + c10::SmallVector out_size; + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + if (a_is_2d) { + if (b_is_2d) { + out_size = {offs->size(0), mat_a.size(0), mat_b.size(1)}; + } else { + TORCH_CHECK(offs->size(0) == mat_b.size(0), "matrix batch sizes have to match"); + out_size = {mat_a.size(0), mat_b.size(-1)}; + } + } else { + if (b_is_2d) { + // this case is not actually encountered for MoE gemms + TORCH_CHECK(offs->size(0) == mat_a.size(0), "matrix batch sizes have to match"); + out_size = {mat_a.size(1), mat_b.size(1)}; + } else { // regular bmm + TORCH_CHECK(mat_a.size(0) == mat_b.size(0), "batched dimension has to match"); + out_size = {mat_a.size(0), mat_a.size(1), mat_b.size(-1)}; + } + } + + #ifndef USE_ROCM + // For TMA transfers, strides of output tensor have to be 
either + // 1, or aligned to 16 bytes. + const auto last_dim = out_size.size() - 1; + const auto alignment = 16 / c10::elementSize(out_dtype); + const int64_t size_padded = (out_size[last_dim] + alignment - 1) / alignment * alignment; + std::vector out_stride; + if (a_is_2d != b_is_2d) { + out_stride = {size_padded, 1}; + } else { + out_stride = {out_size[1] * size_padded, size_padded, 1}; + } + return at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype)); + #else + return at::empty(out_size, mat_a.options().dtype(out_dtype)); + #endif +} + +inline void _grouped_mm_validate_inputs(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype) { + TORCH_CHECK((mat_a.dtype() == at::kBFloat16) || (mat_a.dtype() == at::kFloat) || (mat_a.dtype() == at::kHalf), "Expected mat_a to be Float32, BFloat16 or Float16 matrix, got ", mat_a.scalar_type()); + TORCH_CHECK((mat_b.dtype() == at::kBFloat16) || (mat_b.dtype() == at::kFloat) || (mat_b.dtype() == at::kHalf), "Expected mat_b to be Float32, BFloat16 or Float16 matrix, got ", mat_b.scalar_type()); + TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); + TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + if (!a_is_2d || !b_is_2d) { + TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match"); + } + + // check that the strides are valid, the fn will throw an error if not + check_valid_strides_and_return_transposed(mat_a); + check_valid_strides_and_return_transposed(mat_b); + TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix, or no offset if both matrices are 3d"); + + if (offs.has_value()) { + TORCH_CHECK(offs->dim() == 1, "offs has to be 1D"); + TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); + } + TORCH_CHECK(!bias.has_value(), "Bias not supported yet"); +} + +inline c10::ScalarType _resolve_grouped_mm_out_dtype(const Tensor& mat_a, const Tensor& mat_b, +std::optional out_dtype) { + const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); + // TODO(future PR): enable float32 output dtype for bfloat16 and float16 inputs + TORCH_CHECK(out_dtype_ == mat_a.dtype(), "Grouped gemm output dtype must match `mat_a` dtype"); + return out_dtype_; +} + + +inline void _grouped_mm_fallback(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype, +Tensor out) { + LOG(INFO) << "fallback path for `torch._grouped_mm`, performance may not be optimal"; + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + if (a_is_2d && !b_is_2d) { + // 2d x 3d with offsets + int group_start_idx = 0; + auto offs_cpu = offs.value().cpu(); + for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) { + int group_end_idx = offs_cpu[group_idx].item(); + auto mat_a_slice = mat_a.slice(0, group_start_idx, group_end_idx); + auto out_slice = out.slice(0, group_start_idx, group_end_idx); + at::mm_out(out_slice, mat_a_slice, mat_b[group_idx]); + group_start_idx = group_end_idx; + } + + } else if (!a_is_2d && b_is_2d) { + // 3d x 2d with offsets + int group_start_idx = 0; + auto offs_cpu = offs.value().cpu(); + for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) { + int group_end_idx = offs_cpu[group_idx].item(); + auto mat_b_slice = 
mat_b.slice(1, group_start_idx, group_end_idx); + auto out_slice = out.slice(1, group_start_idx, group_end_idx); + at::mm_out(out_slice, mat_a[group_idx], mat_b_slice); + group_start_idx = group_end_idx; + } + + } else if (a_is_2d && b_is_2d) { + // 2d x 2d with offsets + int group_start_idx = 0; + auto offs_cpu = offs.value().cpu(); + for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) { + int group_end_idx = offs_cpu[group_idx].item(); + auto mat_a_slice = mat_a.slice(1, group_start_idx, group_end_idx); + auto mat_b_slice = mat_b.slice(0, group_start_idx, group_end_idx); + auto out_slice = out[group_idx]; + at::mm_out(out_slice, mat_a_slice, mat_b_slice); + group_start_idx = group_end_idx; + } + + } else { + // 3d x 3d without offsets - regular bmm + at::bmm_out(out, mat_a, mat_b); + } +} + + +} // namespace at::native diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 5d3a84ea39f6d..a744da3bcad2e 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -185,6 +185,17 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra // right: "lro, summed, ro" permuted with rpermutation and the three flattened // then the permuted output is a view of bmm(left, right) // finally, opermutation reverts the permutation to the original order of dimensions + // By default the output is "lro, lo, 1-for-summed-dims, ro" with original shape dimensions. + // However, if all dimensions from the right operand appear before those from the left + // operand in the final output, we can swap the operands so that bmm directly produces + // the result in the correct memory order. + + bool swap_lo_ro = !lo.empty() && !ro.empty() && ro.back() < lo.front(); + if (swap_lo_ro) { + std::swap(left, right); + std::swap(lo, ro); + std::swap(lo_size, ro_size); + } auto out_num_dim = lro.size() + lo.size() + sum_dims_.size() + ro.size(); std::vector out_size; out_size.reserve(out_num_dim); diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 2d7c2ff067c69..b62c584641dba 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1360,6 +1360,7 @@ Tensor outer(const Tensor& self, const Tensor& vec2) { #endif +#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED() static inline int64_t get_mkldnn_matmul_min_dim() { static auto value = [&] { const int64_t default_min_dim = [&] { @@ -1393,6 +1394,7 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) { const int64_t min_size = get_mkldnn_matmul_min_size(); return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size; } +#endif static void addmm_impl_cpu_( @@ -1771,6 +1773,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens (strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1])); }; +#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED() bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]); if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) { try { @@ -1781,6 +1784,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens at::globalContext().setUserEnabledMkldnn(false); } } +#endif if (contraction_size * res_rows * res_cols < 400) { if (is_bmm_out) { diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index 53d56622fe628..ca86292403fbf 100644 --- 
a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -47,10 +47,14 @@ TORCH_META_FUNC(nll_loss_forward) TORCH_CHECK( target.dim() <= 1, "0D or 1D target tensor expected, multi-target not supported"); - - auto no_batch_dim = self.dim() == 1 && target.dim() == 0; + if (self.dim() == 1 && target.dim() == 1) { + TORCH_CHECK_VALUE( + target.size(0) == 1, + "For 1D input, 1D target must have size 1, but got target size: ", + target.size(0)); + } TORCH_CHECK( - no_batch_dim || (self.size(0) == target.size(0)), + self.dim() == 1 || (self.size(0) == target.size(0)), "size mismatch (got input: ", self.sizes(), ", target: ", diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 710a6498d3963..ac1086c6b6bd3 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -537,10 +537,13 @@ BatchNormBackend _select_batch_norm_backend( } // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM once ROCm officially supports NHWC in MIOpen - // See #64427 + // See https://github.com/pytorch/pytorch/issues/64427. // non static variable is used to be able to change environment variable in runtime for testing - // enabled by default for ROCm >= 7.0.0 - bool PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM").value_or(ROCM_VERSION >= 70000); + // enabled by default for ROCm >= 7.0.0 with miopen 3.5 + int miopen_version = detail::getCUDAHooks().compiledWithMIOpen() ? detail::getCUDAHooks().versionMIOpen() : 0; + bool is_miopen_3_4 = miopen_version >= 30400; // ROCm 6.4 + bool is_miopen_3_5 = miopen_version >= 30500; // ROCm 7.0 + bool PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM").value_or(is_miopen_3_5); if ( detail::getCUDAHooks().compiledWithMIOpen() @@ -549,17 +552,15 @@ BatchNormBackend _select_batch_norm_backend( && input.dim() <= MIOPEN_DIM_MAX && input.dim() >= 3 && input.scalar_type() != at::kDouble - && (detail::getCUDAHooks().versionMIOpen() >= 30400 || input.scalar_type() != at::kBFloat16) + && (is_miopen_3_4 || input.scalar_type() != at::kBFloat16) && weight.scalar_type() == at::kFloat // only FP32 weight for FP32 or FP16/BF16(mixed) input && weight.defined() && bias.defined() && ((running_mean.defined() && running_var.defined()) || (!running_mean.defined() && !running_var.defined() && training)) && (input.suggest_memory_format() == MemoryFormat::Contiguous -#if (defined(USE_ROCM) && ROCM_VERSION >= 60500) - || (input.suggest_memory_format() == MemoryFormat::ChannelsLast && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) - || (input.suggest_memory_format() == MemoryFormat::ChannelsLast3d && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) -#endif - ) + || (is_miopen_3_5 && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM && + (input.suggest_memory_format() == MemoryFormat::ChannelsLast + || input.suggest_memory_format() == MemoryFormat::ChannelsLast3d))) ) { return BatchNormBackend::Miopen; } diff --git a/aten/src/ATen/native/Onehot.cpp b/aten/src/ATen/native/Onehot.cpp index 2ac513bf08880..8833bdb6e471d 100644 --- a/aten/src/ATen/native/Onehot.cpp +++ b/aten/src/ATen/native/Onehot.cpp @@ -1,5 +1,6 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -24,8 +25,13 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { if (num_classes == -1) { num_classes = self.max().item().toLong() + 1; } - at::Tensor index = at::arange(num_classes, self.options()); - return 
at::eq(self.unsqueeze(-1), index).to(kLong); + { + // If `self` is a DTensor, then allow implicit replication + // of the `index` Tensor. + at::DTensorAllowImplicitReplication guard; + at::Tensor index = at::arange(num_classes, self.options()); + return at::eq(self.unsqueeze(-1), index).to(kLong); + } } auto shape = self.sizes().vec(); diff --git a/aten/src/ATen/native/PadNd.cpp b/aten/src/ATen/native/PadNd.cpp index 8072d24a1090d..8099648d37b29 100644 --- a/aten/src/ATen/native/PadNd.cpp +++ b/aten/src/ATen/native/PadNd.cpp @@ -240,8 +240,15 @@ Tensor _pad_enum_symint(const Tensor &self, c10::SymIntArrayRef pad, int64_t mod default: {} } } - C10_THROW_ERROR(NotImplementedError, - "Only 2D, 3D, 4D, 5D padding with non-constant padding are supported for now"); + + std::ostringstream error_msg; + error_msg << "Padding size " << pad.size() << " is not supported for " << input_dim << "D input tensor.\n"; + error_msg << "Supported combinations for non-constant padding:\n"; + error_msg << " - 2D or 3D input: padding size = 2 (pads last dimension)\n"; + error_msg << " - 3D or 4D input: padding size = 4 (pads last 2 dimensions)\n"; + error_msg << " - 4D or 5D input: padding size = 6 (pads last 3 dimensions)"; + + C10_THROW_ERROR(NotImplementedError, error_msg.str()); } Tensor pad_symint(const Tensor &self, c10::SymIntArrayRef pad, std::string_view mode, std::optional value) { diff --git a/aten/src/ATen/native/QuantizedLinear.cpp b/aten/src/ATen/native/QuantizedLinear.cpp index f4fdd395f013a..746d8c1a2db4f 100644 --- a/aten/src/ATen/native/QuantizedLinear.cpp +++ b/aten/src/ATen/native/QuantizedLinear.cpp @@ -411,7 +411,8 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) { Tensor fbgemm_linear_fp16_weight_fp32_activation( const Tensor& input, const Tensor& packed_weight, - const std::optional& bias) { + const std::optional& bias, + at::Tensor& output) { TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated " "and will be removed in a future PyTorch release.") @@ -436,9 +437,11 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) const int64_t M = size_to_dim_(input.dim() - 1, input.sizes()); const int64_t N = packed_weight_fp16.numCols(); + std::vector output_size = input.sizes().vec(); output_size.back() = N; - Tensor output = at::empty(output_size, input.options().dtype(at::kFloat)); + // Resize output Tensor + output.resize_(output_size); // Call the fp16 gemm interface fbgemm::cblas_gemm_compute( @@ -460,6 +463,14 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( return output; } +Tensor fbgemm_linear_fp16_weight_fp32_activation( + const Tensor& input, + const Tensor& packed_weight, + const std::optional& bias) { + at::Tensor output = at::empty({0}, input.options().dtype(at::kFloat)); + return at::native::fbgemm_linear_fp16_weight_fp32_activation(input, packed_weight, bias, output); + } + Tensor fbgemm_linear_fp16_weight( const Tensor& input, const Tensor& packed_weight, @@ -468,6 +479,15 @@ Tensor fbgemm_linear_fp16_weight( input, packed_weight, bias); } +Tensor fbgemm_linear_fp16_weight( + const Tensor& input, + const Tensor& packed_weight, + const Tensor& bias, + at::Tensor& output) { + return at::native::fbgemm_linear_fp16_weight_fp32_activation( + input, packed_weight, bias, output); +} + #else // USE_FBGEMM Tensor fbgemm_linear_int8_weight_fp32_activation( @@ -554,6 +574,21 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) { false, "This PyTorch 
installation was not built with FBGEMM operators"); } +Tensor fbgemm_linear_fp16_weight_fp32_activation( + const Tensor& input, + const Tensor& packed_weight, + const std::optional& bias, + at::Tensor& output) { + TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated " + "and will be removed in a future PyTorch release.") + + // We make a strong guarantee that models using these operators will have the + // same numerics across different machines. Therefore, we do not provide a + // fallback path and rather fail loudly if we cannot run FBGEMM. + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); +} + Tensor fbgemm_linear_fp16_weight_fp32_activation( const Tensor& input, const Tensor& packed_weight, @@ -568,6 +603,21 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( false, "This PyTorch installation was not built with FBGEMM operators"); } +Tensor fbgemm_linear_fp16_weight( + const Tensor& input, + const Tensor& packed_weight, + const Tensor& bias, + at::Tensor& output) { + TORCH_WARN_ONCE("fbgemm_linear_fp16_weight is deprecated " + "and will be removed in a future PyTorch release.") + + // We make a strong guarantee that models using these operators will have the + // same numerics across different machines. Therefore, we do not provide a + // fallback path and rather fail loudly if we cannot run FBGEMM. + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); +} + Tensor fbgemm_linear_fp16_weight( const Tensor& input, const Tensor& packed_weight, diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 5f9d5c85750b1..db046428bb683 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -220,6 +220,8 @@ static void check_argmax_argmin( const char* name, const Tensor& self, const std::optional& dim) { + TORCH_CHECK(!self.is_complex(), name, ": does not support complex input"); + TORCH_CHECK(!(self.scalar_type() == kBool), name, ": does not support bool input"); if (dim.has_value()) { auto dim_ = maybe_wrap_dim(dim.value(), self.dim()); native::zero_numel_check_dims(self, dim_, name); diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index 1bdc806a3b4ec..44215a26018f0 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -59,6 +59,8 @@ TORCH_META_FUNC(topk) "selected index k out of range"); int64_t sliceSize = self.dim() == 0 ? 1 : self.size(dim); TORCH_CHECK(k >= 0 && k <= sliceSize, "k not in range for dimension"); + TORCH_CHECK(!self.is_complex(), " topk does not support complex dtypes on CPU"); + TORCH_CHECK(!(self.scalar_type() == kBool), "topk does not support bool dtypes on CPU"); // Build the output size, which is the dim being selected set to // size k @@ -74,11 +76,7 @@ TORCH_META_FUNC2(sort, stable) (const Tensor& self, std::optional stable, int64_t dim, bool descending) { maybe_wrap_dim(dim, self.dim()); - const auto self_dtype = self.dtype(); - TORCH_CHECK_VALUE( - self_dtype != ScalarType::ComplexFloat && - self_dtype != ScalarType::ComplexDouble, - "Sort currently does not support complex dtypes on CPU."); + TORCH_CHECK(!self.is_complex(), " Sort does not support complex dtypes on CPU"); // See issue: https://github.com/pytorch/pytorch/issues/65863 // Strides should be dense, so as not to allocate too much memory. 
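For context on the dtype guards added in ReduceOps.cpp and Sorting.cpp above: complex and bool inputs to topk/sort/argmax now fail fast via TORCH_CHECK on CPU instead of reaching the kernels. A minimal standalone sketch of the resulting user-visible behavior (illustrative only; the tensors, values, and printing below are not part of this patch):

#include <ATen/ATen.h>
#include <iostream>

int main() {
  // topk on a complex CPU tensor is rejected up front by the new TORCH_CHECK.
  at::Tensor c = at::randn({4}, at::kComplexFloat);
  try {
    at::topk(c, 2);
  } catch (const c10::Error& e) {
    std::cout << "topk: " << e.what() << "\n";
  }
  // argmax on a bool tensor is likewise rejected by check_argmax_argmin.
  try {
    at::argmax(at::zeros({3}, at::kBool));
  } catch (const c10::Error& e) {
    std::cout << "argmax: " << e.what() << "\n";
  }
  return 0;
}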
diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 054cc66cf8eb3..1886e65fc1edc 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -1640,6 +1640,9 @@ Tensor zeros_symint( std::optional layout, std::optional device, std::optional pin_memory) { + for (const auto& dim_size : size) { + TORCH_CHECK(dim_size >= 0, "zeros: Dimension size must be non-negative."); + } Layout layout_ = layout.value_or(Layout::Strided); if (at::sparse_csr::is_sparse_compressed(layout_)) { return zeros_sparse_compressed_symint( diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index 77acfe47363e4..4fa0556ad7859 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +58,12 @@ c10::SymInt sym_size(const Tensor& self, int64_t dim) { return self.sym_size(dim); } +c10::SymBool sym_is_contiguous( + const Tensor& self, + c10::MemoryFormat memory_format) { + return self.sym_is_contiguous(memory_format); +} + c10::SymInt sym_stride(const Tensor& self, int64_t dim) { return self.sym_stride(dim); } diff --git a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp index 2d300177a0533..a1a7059b7d64f 100644 --- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp @@ -139,7 +139,7 @@ struct Dist { static inline data_t map(const data_t& diff, const data_t& p) { return diff; } static inline data_t red(const data_t& agg, const data_t& up) { return max(agg, up); } static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return agg; } - // TODO This backward pass uses a very complext expression to compute (diff + // TODO This backward pass uses a very complex expression to compute (diff // == dist) that could be much faster if using SSE instructions. static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { return Vec(grad) * sign(diff) * (Vec(1) - vec::minimum(Vec(1), (diff.abs() - Vec(dist)).abs().ceil())); } }; diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h index 5715fd8f047f2..83b51a9985637 100644 --- a/aten/src/ATen/native/cpu/Loops.h +++ b/aten/src/ATen/native/cpu/Loops.h @@ -89,7 +89,7 @@ execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t using result_type = typename traits::result_type; for (; i < n; i++) { result_type* out_ptr = (result_type*)(data[0] + i * strides[0]); - *out_ptr = c10::guts::apply(op, dereference( + *out_ptr = std::apply(op, dereference( &data[1], &strides[1], i)); @@ -102,7 +102,7 @@ inline void execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) { using traits = function_traits; for (; i < n; i++) { - c10::guts::apply(op, dereference( + std::apply(op, dereference( &data[0], &strides[0], i)); @@ -162,7 +162,7 @@ void handle_tuple_outputs(char* C10_RESTRICT data[], } // Loop operation for `cpu_kernel_multiple_outputs`. -// 1. Use `c10::guts::apply` to make dynamic method invocation +// 1. Use `std::apply` to make dynamic method invocation // for the lambda passed in `cpu_kernel_multiple_outputs`. // 2. Iterate over the members of the returned tuple, set the corresponding // output tensor by the tuple member in `handle_tuple_outputs` function. 
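The Loops.h changes above swap c10::guts::apply for std::apply now that C++17 is available; both unpack a tuple of dereferenced operands into the kernel lambda. A minimal sketch of that call pattern, with illustrative names only (not the actual cpu_kernel machinery):

#include <iostream>
#include <tuple>
#include <utility>

int main() {
  // Stand-in for the element-wise lambda passed to cpu_kernel.
  auto op = [](float a, float b) { return a + b; };
  // Stand-in for the tuple produced by dereference() for one element.
  std::tuple<float, float> args{1.5f, 2.5f};
  // std::apply expands the tuple into the lambda's argument list.
  float out = std::apply(op, std::move(args));
  std::cout << out << "\n";  // prints 4
  return 0;
}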
@@ -183,7 +183,7 @@ multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_ } for (; i < n; i++) { - auto output = c10::guts::apply(op, dereference( + auto output = std::apply(op, dereference( &data[num_outputs], &strides[num_outputs], i)); @@ -213,8 +213,8 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) { auto args1 = dereference_vec(&data[1], opt_scalar, S, i); auto args2 = dereference_vec(&data[1], opt_scalar, S, i + Vec::size()); - auto out1 = c10::guts::apply(vop, std::move(args1)); - auto out2 = c10::guts::apply(vop, std::move(args2)); + auto out1 = std::apply(vop, std::move(args1)); + auto out2 = std::apply(vop, std::move(args2)); out1.store(data[0] + i * sizeof(scalar_t)); out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t)); } diff --git a/aten/src/ATen/native/cpu/PaddingKernel.cpp b/aten/src/ATen/native/cpu/PaddingKernel.cpp index e3f08194bb58e..59d838b9782da 100644 --- a/aten/src/ATen/native/cpu/PaddingKernel.cpp +++ b/aten/src/ATen/native/cpu/PaddingKernel.cpp @@ -156,7 +156,7 @@ void cpu_padding( int64_t offset_h = ndim >= 2 ? p.offsets[ndim - 2] : 0; int64_t offset_w = p.offsets[ndim - 1]; - // do vectorized copy whe output is overlapped with input on W, + // do vectorized copy when output is overlapped with input on W, // only applies to positive padding auto loop = [=](scalar_t* out, const scalar_t* in, bool positive_padding) { if (positive_padding) { diff --git a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp index 317647123d4c0..dac0f3bef25ee 100644 --- a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp +++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -647,10 +648,10 @@ _vec_softmax( parallel_for( 0, outer_size * inner_size, 0, [&](int64_t begin, int64_t end) { int64_t idx = begin; - std::unique_ptr temp_vec_input(new float[dim_size*vectorized_step]()); - std::unique_ptr temp_vec_output(new float[dim_size*vectorized_step]()); - float* temp_vec_input_data = temp_vec_input.get(); - float* temp_vec_output_data = temp_vec_output.get(); + std::vector temp_vec_input(dim_size * vectorized_step); + std::vector temp_vec_output(dim_size * vectorized_step); + float* temp_vec_input_data = temp_vec_input.data(); + float* temp_vec_output_data = temp_vec_output.data(); while (idx < end) { int64_t outer_idx = idx / inner_size; int64_t inner_idx = idx % inner_size; diff --git a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp index 5a288193143d4..d013dfa0485e0 100644 --- a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp @@ -318,7 +318,7 @@ batch_norm_cpu_collect_stats_channels_last_impl( // // The optimal THRESHOLD to tile was found empirically. // When C > THRESHOLD, C is large enough that the benefit from tiling and vectorization outweigh the synchronization overhead. - // Wehn C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization. + // When C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization. // // When num_threads == 1, always use Method 2 as there is no synchronization overhead. 
// diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 40d39b3c7b606..fcaae32e773f1 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -1080,6 +1081,16 @@ static bool _scaled_mm_allowed_device(bool sm90_only=false, bool sm100_only=fals #endif } +static bool _grouped_mm_allowed_device() { +#ifdef USE_ROCM + return false; +#else + auto dprops = at::cuda::getCurrentDeviceProperties(); + // CUDA capability 8.0 and greater + return dprops->major >= 8; +#endif +} + #ifdef USE_ROCM static bool _scaled_mm_is_fnuz() { return at::detail::getCUDAHooks().isGPUArch({"gfx942"}); @@ -1289,21 +1300,30 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, TORCH_CHECK(ROCM_VERSION >= 70000, "Float4_e2m1fn_x2 is only supported for ROCm 7.0 and above"); } if (mat1.scalar_type() == ScalarType::Float8_e5m2 || mat2.scalar_type() == ScalarType::Float8_e5m2) { - TORCH_CHECK(ROCM_VERSION >= 60000, "Float8_e5m2 is only supported for ROCm 6.0 and above"); + TORCH_CHECK(ROCM_VERSION >= 60500, "Float8_e5m2 is only supported for ROCm 6.5 and above"); } if (mat1.scalar_type() == ScalarType::Float8_e4m3fn || mat2.scalar_type() == ScalarType::Float8_e4m3fn) { - TORCH_CHECK(ROCM_VERSION >= 60000, "Float8_e4m3fn is only supported for ROCm 6.0 and above"); + TORCH_CHECK(ROCM_VERSION >= 60500, "Float8_e4m3fn is only supported for ROCm 6.5 and above"); } #endif if (bias) { - TORCH_CHECK(out.scalar_type() != kFloat, "Bias is not supported when out_dtype is set to Float32"); - TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 || bias->scalar_type() == ScalarType::Half, - "Bias must be either Half or BFloat16, but got ", bias->scalar_type()); - TORCH_CHECK((out.scalar_type() != kFloat && out.scalar_type() != ScalarType::BFloat16) || - bias->scalar_type() == ScalarType::BFloat16, - "Bias must be BFloat16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type()); - TORCH_CHECK(out.scalar_type() != ScalarType::Half || bias->scalar_type() == ScalarType::Half, - "Bias must be Float16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type()); + TORCH_CHECK(out.scalar_type() != kFloat, + "Bias is not supported when out_dtype is set to Float32"); + + TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 || + bias->scalar_type() == ScalarType::Half, + "Bias must be BFloat16 or Half, but got ", bias->scalar_type()); + + TORCH_CHECK((out.scalar_type() != kFloat && + out.scalar_type() != ScalarType::BFloat16) || + bias->scalar_type() == ScalarType::BFloat16, + "Bias must be BFloat16 to compute ", out.scalar_type(), + " output, but got ", bias->scalar_type()); + + TORCH_CHECK(out.scalar_type() != ScalarType::Half || + bias->scalar_type() == ScalarType::Half, + "Bias must be Float16 to compute ", out.scalar_type(), + " output, but got ", bias->scalar_type()); } { auto bias_ = bias.value_or(Tensor()); @@ -1339,7 +1359,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, // We are doing row-wise scaling auto dprops = at::cuda::getCurrentDeviceProperties(); if (scaling_choice_a == ScalingType::RowWise && scaling_choice_b == ScalingType::RowWise - && (dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)) { + && ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900) + // cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales + || (dprops->major >= 
10 && (scale_a.sizes().size() || scale_b.sizes().size())))) { TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling."); at::cuda::detail::f8f8bf16_rowwise( mat1, @@ -1365,6 +1387,22 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16, "hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type()); } + else if (scaling_choice_a == ScalingType::BlockWise1x32 && scaling_choice_b == ScalingType::BlockWise1x32) { + #if ROCM_VERSION >= 70000 + TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}), + "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950"); + + TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 && + mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0, + "Matrix dimensions must be multiples of 32 for block-wise scaling"); + + TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 || + out.scalar_type() == ScalarType::Half, + "Block-wise scaling only supports BFloat16 or Half output types"); +#else + TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later"); +#endif + } #endif cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, scale_result, scaling_choice_a, scaling_choice_b); @@ -1442,12 +1480,14 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, params.k = args.k; params.a = args.mata->data_ptr(); params.a_scale_ptr = args.scale_mata_ptr; + params.a_scale_dtype = args.scale_mata_dtype.value(); params.lda = args.lda; params.a_dtype = args.mata->scalar_type(); params.a_scale_dtype = args.scale_mata_dtype.value(); params.a_scaling_type = args.scaling_mata_type.value(); params.b = args.matb->data_ptr(); params.b_scale_ptr = args.scale_matb_ptr; + params.b_scale_dtype = args.scale_matb_dtype.value(); params.ldb = args.ldb; params.b_dtype = args.matb->scalar_type(); params.b_scale_dtype = args.scale_matb_dtype.value(); @@ -1512,71 +1552,8 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, } namespace { - at::Tensor create_grouped_gemm_output_tensor(const Tensor& mat_a, - const Tensor& mat_b, - const std::optional& offs, - std::optional out_dtype - ) { - c10::SmallVector out_size; - const bool a_is_2d = mat_a.dim() == 2; - const bool b_is_2d = mat_b.dim() == 2; - if (a_is_2d) { - if (b_is_2d) { - out_size = {offs->size(0), mat_a.size(0), mat_b.size(1)}; - } else { - TORCH_CHECK(offs->size(0) == mat_b.size(0), "matrix batch sizes have to match"); - out_size = {mat_a.size(0), mat_b.size(-1)}; - } - } else { - if (b_is_2d) { - // this case is not actually encountered for MoE gemms - TORCH_CHECK(offs->size(0) == mat_a.size(0), "matrix batch sizes have to match"); - out_size = {mat_a.size(1), mat_b.size(1)}; - } else { // regular bmm - TORCH_CHECK(mat_a.size(0) == mat_b.size(0), "batched dimension has to match"); - out_size = {mat_a.size(0), mat_a.size(1), mat_b.size(-1)}; - } - } - - const auto out_dtype_ = out_dtype.value_or(kBFloat16); - TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm"); - - #ifndef USE_ROCM - // For TMA transfers, strides of output tensor have to be either - // 1, or aligned to 16 bytes. 
- const auto last_dim = out_size.size() - 1; - const auto alignment = 16 / c10::elementSize(out_dtype_); - const int64_t size_padded = (out_size[last_dim] + alignment - 1) / alignment * alignment; - std::vector out_stride; - if (a_is_2d != b_is_2d) { - out_stride = {size_padded, 1}; - } else { - out_stride = {out_size[1] * size_padded, size_padded, 1}; - } - return at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype_)); - #else - return at::empty(out_size, mat_a.options().dtype(out_dtype_)); - #endif - } - - bool check_valid_strides_and_return_transposed(const Tensor& mat) { - IntArrayRef tensor_strides = mat.strides(); - IntArrayRef tensor_sizes = mat.sizes(); - int end_dim = mat.dim() - 1; - int alignment = 16 / mat.element_size(); - TORCH_CHECK(uint64_t(mat.data_ptr()) % 16 ==0, "expected data_ptr to be aligned to 16 bytes\n"); - if ((tensor_strides[end_dim - 1] == 1) && (tensor_strides[end_dim] >= std::max(1, tensor_sizes[end_dim - 1]))) { - TORCH_CHECK(tensor_strides[end_dim] % alignment == 0, "strides should be multiple of 16 bytes"); - return true; - } else if ((tensor_strides[end_dim] == 1) && (tensor_strides[end_dim - 1] >= std::max(1, tensor_sizes[end_dim]))) { - TORCH_CHECK(tensor_strides[end_dim - 1] % alignment == 0, "strides should be multiple of 16 bytes"); - return false; - } else { - TORCH_CHECK(false, "Invalid strides/sizes, got ", mat.strides(), " for strides and ", mat.sizes(), " for sizes"); - } - } - - void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) { + void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) { + // Checks scales for 2d or 3d target tensors (`mat`). if (mat.dim() == 2) { TORCH_CHECK( scale.dim() == 1, @@ -1610,9 +1587,66 @@ namespace { "scale must have the same first dimension as mat for arg ", arg_idx); } -} + } + void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) { + // Checks scales for 2d or 3d target tensors (`mat`). + if (mat.dim() == 2) { + // For MXFP8, 2d tensors have variable size groups represented as subtensors, + // that are converted to blocked padded format individually, + // so we can't check the scale sizes without doing a d2h sync to get the group sizes here. + TORCH_CHECK( + scale.dim() == mat.dim(), + "for mxfp8, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(), " and scale.dim() = ", scale.dim(), " for arg ", arg_idx); + + // LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/32, 4)) + // RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/32, 4)) + // * weight is transposed prior to the call, scale stays non-transposed. + bool LHS = arg_idx == 0; + int scale_dim_to_check = 0; + int mat_dim_to_check = LHS ? 0 : 1; + TORCH_CHECK( + scale.size(scale_dim_to_check) >= mat.size(mat_dim_to_check), + "for mxfp8, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ", + "must have scale.shape[", scale_dim_to_check, "] >= ", mat.size(mat_dim_to_check), " but got scale.shape=(", scale.size(0), ", ", scale.size(1), ")"); + } else { + // For MXFP8, 3d tensors have static group sizes (stack of 2d tensors), + // so we can check the exact expected scale sizes here without a d2h sync. 
+ auto round_up = [](auto x, auto y) { + return ((x + y - 1) / y) * y; + }; + + // TODO: this is for 3d tensor in 2d-3d case specifically. + // We'll need to support 3d-3d and 3d-2d cases once mxfp8 grouped gemm supports them. + int64_t G = mat.size(0); + int64_t K = mat.size(1); + int64_t N = mat.size(2); + int64_t blocked_scale_K = round_up(K/32, 4); + int64_t blocked_scale_N = round_up(N, 128); + + // fbgemm expects stack of flattened blocked scales for 3d tensor, shape (G, blocked_scale_K * blocked_scale_N). + TORCH_CHECK( + scale.dim() == mat.dim() - 1, + "for mxfp8 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N), but scale is ", scale.dim(), "D for arg ", arg_idx + ); + TORCH_CHECK( + scale.size(0) == G && scale.size(1) == blocked_scale_K * blocked_scale_N, + "for mxfp8, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ") for arg ", arg_idx + ); + } + } + void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) { + bool using_fp8_rowwise = scale.scalar_type() == kFloat; + bool using_mxfp8 = scale.scalar_type() == at::kFloat8_e8m0fnu; + if (using_fp8_rowwise) { + _check_scales_fp8_rowwise(mat, scale, dim, arg_idx, scale_multiplier); + } else if (using_mxfp8) { + _check_scales_mxfp8(mat, scale, dim, arg_idx); + } else { + TORCH_CHECK(false, "scale must be float32 or float8_e8m0fnu, but got ", scale.dtype()); + } + } } Tensor @@ -1637,8 +1671,8 @@ const std::optional& bias, const std::optional& scale_result, std::optional out_dtype, bool use_fast_accum) { - bool allowed_device = _scaled_mm_allowed_device(); - TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0, or ROCm MI300+"); + bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true); + TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = [9.0, 10.0], or ROCm MI300+"); TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed"); TORCH_CHECK(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed"); @@ -1671,16 +1705,47 @@ bool use_fast_accum) { TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); } - // Both Per-Tensor and Row-wise scaling expect fp32 tensors + // FP8 per-tensor and per-row scaling expect fp32 scales. + // MXFP8 expects float8_e8m0fnu scales. TORCH_CHECK( - scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat, - "Both scale_a and scale_b must be float (fp32) tensors."); + (scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat) || + (scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu), + "For FP8 tensorwise and rowwise, both scales must both be float32 tensors. For MXFP8, scales must both be float8_e8m0fnu tensors."); const int scale_multiplier = (mat_a.dim() == 2 && mat_b.dim() == 2) ? 
offs->size(0) : 1; check_scale(mat_a, scale_a, 0 ,0, scale_multiplier); check_scale(mat_b, scale_b, 1, 1, scale_multiplier); - Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype); + const auto out_dtype_ = out_dtype.value_or(kBFloat16); + TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm"); + + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); + +#if defined(USE_FBGEMM_GENAI) && defined(USE_CUDA) && !defined(USE_ROCM) + // MXFP8 grouped GEMM dispatching + bool is_mx8mx8bf16 = ( + mat_a.scalar_type() == at::kFloat8_e4m3fn && mat_b.scalar_type() == at::kFloat8_e4m3fn && + scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu + ); + TORCH_CHECK(out_dtype == at::kBFloat16, "Only bf16 out_dtype is supported for MXFP8 grouped gemm"); + + if (is_mx8mx8bf16) { + bool b_is_3d = mat_b.dim() == 3; + bool is_2d_2d = a_is_2d && b_is_2d; + bool is_2d_3d = a_is_2d && b_is_3d; + TORCH_CHECK(is_2d_2d || is_2d_3d, "MXFP8 grouped GEMM currently only supports 2d-2d and 2d-3d cases"); + TORCH_CHECK(offs.has_value(), "MXFP8 2d-2d and 2d-3d grouped GEMMs requires offsets"); + + fbgemm_gpu::mx8mx8bf16_grouped_mm( + mat_a, + mat_b, + scale_a, + scale_b, + offs.value(), + out); + return out; + } +#endif #ifndef USE_ROCM TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type()); @@ -1713,6 +1778,7 @@ bool use_fast_accum) { #else TORCH_CHECK(false, "grouped gemm is not supported without USE_FBGEMM_GENAI on ROCM") #endif + #endif } @@ -1722,33 +1788,21 @@ const std::optional& offs, const std::optional& bias, std::optional out_dtype) { #ifndef USE_ROCM - bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true); - TORCH_CHECK(allowed_device, "torch._grouped_mm is only supported on CUDA devices with compute capability = 9.0, 10.0"); - - TORCH_CHECK(mat_a.dtype() == at::kBFloat16, "Expected mat_a to be BFloat16 matrix got ", mat_a.scalar_type()); - TORCH_CHECK(mat_b.dtype() == at::kBFloat16, "Expected mat_a to be BFloat16 matrix got ", mat_b.scalar_type()); - TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); - TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); - const bool a_is_2d = mat_a.dim() == 2; - const bool b_is_2d = mat_b.dim() == 2; - if (!a_is_2d || !b_is_2d) { - TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match"); - } - - // check that the strides are valid, the fn will throw an error if not - check_valid_strides_and_return_transposed(mat_a); - check_valid_strides_and_return_transposed(mat_b); - TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix, or no offset if both matrices are 3d"); - - if (offs.has_value()) { - TORCH_CHECK(offs->dim() == 1, "offs has to be 1D"); - TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); + _grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype); + bool a_b_and_out_are_bf16 = ( + mat_a.dtype() == at::kBFloat16 && + mat_b.dtype() == at::kBFloat16 && + out_dtype.value_or(at::kBFloat16) == at::kBFloat16 + ); + bool use_fast_path = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true) && a_b_and_out_are_bf16; + const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, 
out_dtype_); + if (use_fast_path) { + // fast path, no d2h sync needed + at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out); + } else { + _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); } - TORCH_CHECK(!bias.has_value(), "Bias not supported yet"); - - Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype); - - at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out); return out; #else TORCH_CHECK(false, "grouped gemm is not supported on ROCM") diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index 16acbe0b8bf2d..12ad84a15b180 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -436,7 +436,6 @@ static inline void launch_vectorized_templated_kernel( loader_t l, storer_t s) { TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); - using traits = function_traits; int64_t grid = (N + vectorized_templated_config::block_work_size() - 1) / vectorized_templated_config::block_work_size(); auto stream = at::cuda::getCurrentCUDAStream(); diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu index b5908cc0abcfc..c6d3c25200d50 100644 --- a/aten/src/ATen/native/cuda/LossCTC.cu +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -644,7 +644,12 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ Tensor grad = at::full_like(log_probs, neginf, LEGACY_CONTIGUOUS_MEMORY_FORMAT); // initialization for log(sum (alpha beta)) // As above, there may be better configurations to use. - constexpr int max_threads = std::is_same_v ? 1024 : 896; // we need 72 or so 32 bit registers for double + constexpr int max_threads_ = std::is_same_v ? 1024 : 896; // we need 72 or so 32 bit registers for double + int max_threads = max_threads_; + // Blackwell launch bounds + if (at::cuda::getCurrentDeviceProperties()->major >= 10) { + max_threads = 512; + } int threads_target = max_threads; while (threads_target / 2 >= 2*max_target_length+1) { threads_target /= 2; diff --git a/aten/src/ATen/native/cuda/Pow.cuh b/aten/src/ATen/native/cuda/Pow.cuh index dc9faf77f22a3..fe249c1cdaef3 100644 --- a/aten/src/ATen/native/cuda/Pow.cuh +++ b/aten/src/ATen/native/cuda/Pow.cuh @@ -14,7 +14,7 @@ namespace { // pow(double, int) // pow(float, float) // pow(double, double) -#ifdef _MSC_VER +#if defined(_MSC_VER) || defined(_LIBCPP_VERSION) // Functions for pow // pow for at::Half static inline __host__ __device__ at::Half pow_(at::Half base, at::Half exp) { diff --git a/aten/src/ATen/native/cuda/SegmentReduce.cu b/aten/src/ATen/native/cuda/SegmentReduce.cu index 3acb359342f13..c6f88692a8a5c 100644 --- a/aten/src/ATen/native/cuda/SegmentReduce.cu +++ b/aten/src/ATen/native/cuda/SegmentReduce.cu @@ -20,7 +20,7 @@ // SegmentReduce compilation with CUDA-12.9 causes NVCC crash on Windows // See https://github.com/pytorch/pytorch/issues/156181 -#if !defined(_WIN32) || CUDART_VERSION < 12090 +#if !(defined(_WIN32) && CUDART_VERSION == 12090) namespace at::native { @@ -606,4 +606,4 @@ REGISTER_DISPATCH( } // namespace at::native -#endif +#endif \ No newline at end of file diff --git a/aten/src/ATen/native/cuda/int4mm.cu b/aten/src/ATen/native/cuda/int4mm.cu index 272eb9b9c564f..5444bb57eba7c 100644 --- a/aten/src/ATen/native/cuda/int4mm.cu +++ b/aten/src/ATen/native/cuda/int4mm.cu @@ -1304,7 +1304,7 @@ at::Tensor _convert_weight_to_int4pack_cuda( constexpr int32_t kKTileSize = 16; // GPT-FAST assumes nTileSize of 8 for 
quantized weight tensor. - // See https://github.com/pytorch-labs/gpt-fast/blob/091515ab5b06f91c0d6a3b92f9c27463f738cc9b/quantize.py#L510 + // See https://github.com/meta-pytorch/gpt-fast/blob/091515ab5b06f91c0d6a3b92f9c27463f738cc9b/quantize.py#L510 // Torch dynamo also requires the torch ops has the same output shape for each device. // See https://github.com/pytorch/pytorch/blob/ec284d3a74ec1863685febd53687d491fd99a161/torch/_meta_registrations.py#L3263 constexpr int32_t kNTileSizeTensor = 8; diff --git a/aten/src/ATen/native/cuda/int8mm.cu b/aten/src/ATen/native/cuda/int8mm.cu new file mode 100644 index 0000000000000..60f64cd9fc203 --- /dev/null +++ b/aten/src/ATen/native/cuda/int8mm.cu @@ -0,0 +1,74 @@ +#include +#include +#include +#include + +namespace at::native { + +__global__ void weight_int8pack_mm_kernel(const float* x, const int8_t* w, const float* scale, float* out, int B, int K, int N) { + // one thread per output element: [B, N] + int b = blockIdx.y * blockDim.y + threadIdx.y; + int n = blockIdx.x * blockDim.x + threadIdx.x; + + if (b >= B || n >= N) return; + + float acc = 0.0f; + for (int k = 0; k < K; ++k) { + acc += x[b * K + k] * static_cast(w[n * K + k]); + } + + out[b * N + n] = acc * scale[n]; +} + +void launch_weight_int8pack_mm_cuda_kernel(const Tensor& x, const Tensor& w_int8, const Tensor& scale, Tensor& out) { + const int B = x.size(0); + const int K = x.size(1); + const int N = w_int8.size(0); + + const dim3 block(16, 16); + const dim3 grid((N + block.x - 1) / block.x, (B + block.y - 1) / block.y); + + auto stream = at::cuda::getCurrentCUDAStream(); + + weight_int8pack_mm_kernel<<>>( + x.data_ptr(), + w_int8.data_ptr(), + scale.data_ptr(), + out.data_ptr(), + B, K, N); +} + + +// Main GPU entry point +at::Tensor _weight_int8pack_mm_cuda(const at::Tensor& x, const at::Tensor& w_int8, const at::Tensor& scale) { + // --- Check inputs --- + TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor"); + TORCH_CHECK(w_int8.is_cuda(), "w must be a CUDA tensor"); + TORCH_CHECK(scale.is_cuda(), "scale must be a CUDA tensor"); + + TORCH_CHECK(x.dim() == 2, "x must be 2D"); + TORCH_CHECK(w_int8.dim() == 2, "w must be 2D"); + TORCH_CHECK(scale.dim() == 1, "scale must be 1D"); + + TORCH_CHECK(x.size(1) == w_int8.size(1), "K dimension mismatch: x.size(1) != w.size(1)"); + TORCH_CHECK(w_int8.size(0) == scale.size(0), "Output dim mismatch: w.size(0) != scale.size(0)"); + + // --- Determine shapes --- + auto B = x.size(0); // batch size + auto N = w_int8.size(0); // output dim + + // Ensure inputs are in the correct types for the kernel + auto x_f32 = x.to(at::kFloat); + auto w_int8_contiguous = w_int8.contiguous(); + auto scale_f32 = scale.to(at::kFloat); + + // --- Allocate output --- + auto out = at::empty({B, N}, x.options().dtype(at::kFloat)); + + // --- Launch kernel --- + launch_weight_int8pack_mm_cuda_kernel(x_f32, w_int8_contiguous, scale_f32, out); + + return out; +} + +} // namespace at::native diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp index 4d869e5679f8a..081b4afa15ac5 100644 --- a/aten/src/ATen/native/cudnn/Conv_v7.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp @@ -285,7 +285,7 @@ struct algorithm_search { sizeof(algos) / sizeof(algos[0]) == num_algos, "Missing cuDNN convolution forward algorithms"); int perf_count; - std::unique_ptr perf_results(new perf_t[num_algos]); + c10::SmallVector perf_results; if (!benchmark) { AT_CUDNN_CHECK_WITH_SHAPES( cudnnGetConvolutionForwardAlgorithm_v7( @@ -296,7 +296,7 @@ struct 
algorithm_search { args.odesc.desc(), num_algos, &perf_count, - perf_results.get()), + perf_results.data()), args); } else { size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); @@ -314,7 +314,7 @@ struct algorithm_search { args.output.data_ptr(), num_algos, &perf_count, - perf_results.get(), + perf_results.data(), ws.data, ws.size), args); @@ -324,7 +324,7 @@ struct algorithm_search { // memory, e.g. a few GBs. c10::cuda::CUDACachingAllocator::emptyCache(); } - return getValidAlgorithms(perf_results.get(), args, perf_count); + return getValidAlgorithms(perf_results.data(), args, perf_count); } static void getWorkspaceSize( @@ -369,7 +369,8 @@ struct algorithm_search { sizeof(algos) / sizeof(algos[0]) == num_algos, "Missing cuDNN convolution backward data algorithms."); int perf_count; - std::unique_ptr perf_results(new perf_t[num_algos]); + c10::SmallVector + perf_results; if (!benchmark) { AT_CUDNN_CHECK_WITH_SHAPES( cudnnGetConvolutionBackwardDataAlgorithm_v7( @@ -380,7 +381,7 @@ struct algorithm_search { args.idesc.desc(), num_algos, &perf_count, - perf_results.get()), + perf_results.data()), args); } else { size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); @@ -398,7 +399,7 @@ struct algorithm_search { args.input.data_ptr(), num_algos, &perf_count, - perf_results.get(), + perf_results.data(), ws.data, ws.size), args); @@ -408,7 +409,7 @@ struct algorithm_search { // memory, e.g. a few GBs. c10::cuda::CUDACachingAllocator::emptyCache(); } - return getValidAlgorithms(perf_results.get(), args, perf_count); + return getValidAlgorithms(perf_results.data(), args, perf_count); } static void getWorkspaceSize( @@ -456,7 +457,8 @@ struct algorithm_search { static_assert( sizeof(algos) / sizeof(algos[0]) == num_algos, "Missing cuDNN convolution backward filter algorithms."); - std::unique_ptr perf_results(new perf_t[num_algos]); + c10::SmallVector + perf_results; int perf_count; if (!benchmark) { AT_CUDNN_CHECK_WITH_SHAPES( @@ -468,7 +470,7 @@ struct algorithm_search { args.wdesc.desc(), num_algos, &perf_count, - perf_results.get()), + perf_results.data()), args); } else { size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); @@ -486,7 +488,7 @@ struct algorithm_search { args.weight.data_ptr(), num_algos, &perf_count, - perf_results.get(), + perf_results.data(), ws.data, ws.size), args); @@ -496,7 +498,7 @@ struct algorithm_search { // memory, e.g. a few GBs. 
c10::cuda::CUDACachingAllocator::emptyCache(); } - return getValidAlgorithms(perf_results.get(), args, perf_count); + return getValidAlgorithms(perf_results.data(), args, perf_count); } static void getWorkspaceSize( diff --git a/aten/src/ATen/native/cudnn/MHA.cpp b/aten/src/ATen/native/cudnn/MHA.cpp index 48119a6a3b4c3..c2f7ce2ac2d53 100644 --- a/aten/src/ATen/native/cudnn/MHA.cpp +++ b/aten/src/ATen/native/cudnn/MHA.cpp @@ -2,9 +2,13 @@ #include #include -#if defined(USE_ROCM) || !AT_CUDNN_ENABLED() || \ - (defined(CUDNN_VERSION) && CUDNN_VERSION < 8900) +#if AT_CUDNN_ENABLED() +#include +#endif +#if defined(USE_ROCM) || !AT_CUDNN_ENABLED() || \ + (defined(CUDNN_VERSION) && CUDNN_VERSION < 8900) || \ + (defined(CUDNN_FRONTEND_VERSION) && CUDNN_FRONTEND_VERSION < 10100) namespace at { namespace native { @@ -84,6 +88,37 @@ void run_cudnn_SDP_bprop( false, "PyTorch was not compiled with cuDNN Flash Attention enabled!"); } +void run_cudnn_SDP_bprop_nestedtensor( + int64_t b, + int64_t h_q, + int64_t h_k, + int64_t h_v, + int64_t s_q, + int64_t s_kv, + int64_t d_qk, + int64_t d_v, + + float scaling_factor, + bool is_causal, + float dropout_probability, + const Tensor& cum_seqlen_q, + const Tensor& cum_seqlen_kv, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const std::optional& attn_bias, + const Tensor& o, + const Tensor& dO, + const Tensor& softmaxstats, + Tensor& dQ, + Tensor& dK, + Tensor& dV, + const Tensor& dropoutseed, + const Tensor& dropoutoffset) { + TORCH_CHECK( + false, "PyTorch was not compiled with cuDNN Flash Attention enabled!"); +} + } // namespace native } // namespace at @@ -95,7 +130,6 @@ void run_cudnn_SDP_bprop( #include #include -#include #include #include @@ -111,42 +145,58 @@ namespace native { #include namespace fe = cudnn_frontend; -using graph_and_tensors = std::tuple< - std::shared_ptr, - std::shared_ptr, // Q, - std::shared_ptr, // K, - std::shared_ptr, // V, - std::optional>, // Bias - std::shared_ptr, // Attn_scale, - // TODO(eqy): additional options - // std::shared_ptr, // SEQ_LEN_Q, - // std::shared_ptr, // SEQ_LEN_KV, - std::shared_ptr, // Seed, - std::shared_ptr, // Offset, - // std::shared_ptr, // Dropout_mask, - // std::shared_ptr, // Dropout_scale - std::shared_ptr, // O - std::shared_ptr // Stats - >; - -using graph_and_tensors_backward = std::tuple< - std::shared_ptr, - std::shared_ptr, // Q, - std::shared_ptr, // K, - std::shared_ptr, // V, - std::optional>, // Bias, - std::shared_ptr, // Attn_scale, - std::shared_ptr, // Seed, - std::shared_ptr, // Offset, - std::shared_ptr, // O, - std::shared_ptr, // dO, - std::shared_ptr, // stats, - std::shared_ptr, // dQ, - std::shared_ptr, // dK,, - std::shared_ptr // dV, - >; - -#define MAX_MHA_DIM 4 + +constexpr uint8_t MAX_MHA_DIM = 4; + +// Whether we will use ragged offsets in the dense (non-nested) path +// to avoid recompilation +bool use_ragged_in_dense( + const Tensor& q, + const Tensor& k, + const Tensor& v, + const Tensor& o, + bool has_bias) { + static bool flag = + c10::utils::check_env("TORCH_CUDNN_SDPA_AVOID_RECOMPILE") == true; + if (!flag) { + return flag; + } + TORCH_WARN_ONCE( + "TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 is currently experimental. " + "Please report any issues to https://github.com/pytorch/pytorch/issues."); + if (has_bias) { + TORCH_WARN_ONCE( + "TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 only works without bias." + "Consider using the is_causal hint instead of bias for causal masking." 
+ "Falling back to regular dense case, which may trigger excessive recompilation."); + return !has_bias; + } + bool all_bshd = q.dim() == 4 && q.transpose(1, 2).is_contiguous() && + k.dim() == 4 && k.transpose(1, 2).is_contiguous() && v.dim() == 4 && + v.transpose(1, 2).is_contiguous() && o.dim() == 4 && + o.transpose(1, 2).is_contiguous(); + if (!all_bshd) { + TORCH_WARN_ONCE( + "TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 only works with Q, K, V, and output in BSHD memory layout," + "e.g., Q, K, V must be allocated with torch.randn((B, S, H, D).transpose(1, 2)." + "Falling back to regualr dense case, which may trigger excessive recompilation."); + } + return all_bshd; +} + +int roundup_power2(int dim) { + if (!dim) { + return 1; + } + dim--; + dim |= dim >> 1; + dim |= dim >> 2; + dim |= dim >> 4; + dim |= dim >> 8; + dim |= dim >> 16; + dim++; + return dim; +} struct MHAParams { c10::DeviceIndex device_id; @@ -171,6 +221,7 @@ struct MHAParams { // might be redundant if we take 0 dim/stride // as signaling no-bias bool has_attn_bias; + bool use_ragged; }; void setMHAParams( @@ -187,7 +238,8 @@ void setMHAParams( const std::optional& attn_bias, double dropout_probability, bool is_causal, - bool return_softmaxstats) { + bool return_softmaxstats, + bool is_nested) { memset(¶ms, 0, sizeof(MHAParams)); params.device_id = at::cuda::current_device(); params.dataType = fe::DataType_t::HALF; @@ -204,23 +256,24 @@ void setMHAParams( params.is_causal = is_causal; params.return_softmaxstats = return_softmaxstats; params.has_attn_bias = attn_bias.has_value(); + // Expect 4D dense tensor, 3D nested case (THD) TORCH_INTERNAL_ASSERT( - q.sizes().size() == MAX_MHA_DIM, + q.sizes().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "Q tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - q.strides().size() == MAX_MHA_DIM, + q.strides().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "Q tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - k.sizes().size() == MAX_MHA_DIM, + k.sizes().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "K tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - k.strides().size() == MAX_MHA_DIM, + k.strides().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "K tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - v.sizes().size() == MAX_MHA_DIM, + v.sizes().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "V tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - v.strides().size() == MAX_MHA_DIM, + v.strides().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "V tensor has unexpected number of dims, please report a bug to PyTorch."); std::copy(q.sizes().begin(), q.sizes().end(), params.q_dim.begin()); std::copy(q.strides().begin(), q.strides().end(), params.q_stride.begin()); @@ -228,6 +281,20 @@ void setMHAParams( std::copy(k.strides().begin(), k.strides().end(), params.k_stride.begin()); std::copy(v.sizes().begin(), v.sizes().end(), params.v_dim.begin()); std::copy(v.strides().begin(), v.strides().end(), params.v_stride.begin()); + bool use_ragged = use_ragged_in_dense(q, k, v, q, params.has_attn_bias); + params.use_ragged = use_ragged; + if (use_ragged) { + // ignore B - stride in BSHD (THD) avoid-recompile + params.q_stride[0] = INT_MAX; + params.k_stride[0] = INT_MAX; + params.v_stride[0] = INT_MAX; + // fix seqlen to rounded value + 
params.s_q = roundup_power2(params.s_q); + params.s_kv = roundup_power2(params.s_kv); + params.q_dim[2] = roundup_power2(params.q_dim[2]); + params.k_dim[2] = roundup_power2(params.k_dim[2]); + params.v_dim[2] = roundup_power2(params.v_dim[2]); + } // uninit is OK as the struct is memset 0'd if (params.has_attn_bias) { std::copy( @@ -255,7 +322,8 @@ struct MHACacheKeyWrapper : ParamsWrapper { const std::optional& attn_bias, double dropout_probability, bool is_causal, - bool return_softmaxstats) { + bool return_softmaxstats, + bool is_nested) { setMHAParams( this->pod, b, @@ -270,22 +338,37 @@ struct MHACacheKeyWrapper : ParamsWrapper { attn_bias, dropout_probability, is_causal, - return_softmaxstats); + return_softmaxstats, + is_nested); } }; template struct MHAGraphCache { std::unordered_map> engine_cache; + int count = 0; + int hits = 0; // no mutexes here as caches are now thread local for v8, can also return a // pointer to the Execution Plan if we know it will not be invalidated by // another thread T* find(const KeyType& key) { + static bool flag = + c10::utils::check_env("TORCH_CUDNN_SDPA_CACHE_DEBUG") == true; + if (flag && count) { + TORCH_WARN( + "SDPA Cache Called ", + count, + " times. Hit rate: ", + 100 * hits / count, + "%"); + } + count++; auto it = engine_cache.find(key); if (it == engine_cache.end()) { return nullptr; } + hits++; return &(it->second); } @@ -298,11 +381,45 @@ struct MHAGraphCache { // @eqy: use thread local caches as cuDNN Execution Plans are not guaranteed to // be thread safe across all engines see Limitations in // https://docs.nvidia.com/deeplearning/cudnn/backend/latest/release-notes.html -thread_local MHAGraphCache mhagraphcache; -thread_local MHAGraphCache - mhagraphbackwardcache; +// We also leak the caches to workaround potential teardown race issues. + +auto& getMHAGraphCache_() { + thread_local auto& instance = + *new MHAGraphCache, MHACacheKeyWrapper>; + return instance; +} + +auto& getMHAGraphBackwardCache_() { + thread_local auto& instance = + *new MHAGraphCache, MHACacheKeyWrapper>; + return instance; +} namespace { + +enum UIDS { + Q, + K, + V, + O, + BIAS, + SCALE, + SEED, + OFFSET, + LSE, + DO, + DQ, + DK, + DV, + SEQ_LEN_Q, + SEQ_LEN_KV, + RAG_Q_OFF, + RAG_K_OFF, + RAG_V_OFF, + RAG_O_OFF, + RAG_LSE_OFF +}; + // analogous to the same function in Descriptors.h for cuDNN Convolutions... auto fixSizeOneDimStrideSDPA( const IntArrayRef sizes, @@ -320,9 +437,10 @@ auto fixSizeOneDimStrideSDPA( } return strides; } + } // namespace -auto build_graph_and_tensors( +auto build_graph( int64_t b, int64_t h, int64_t s_q, @@ -355,65 +473,162 @@ auto build_graph_and_tensors( .set_compute_data_type(fe::DataType_t::FLOAT); auto attn_scale = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SCALE) .set_name("Attn_scale") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) .set_is_pass_by_value(true) .set_data_type(fe::DataType_t::FLOAT)); - auto seed = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Seed") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type( - dropoutseed.dtype() == kInt - ? 
fe::DataType_t::INT32 - : fe::DataType_t::INT64)); - auto offset = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Offset") + auto scaled_dot_product_flash_attention_options = + fe::graph::SDPA_attributes() + .set_name("CUDNN_SDPA") + .set_generate_stats(return_softmaxstats) + .set_causal_mask(is_causal) + .set_attn_scale(attn_scale); + if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { + auto SEQ_LEN_Q_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEQ_LEN_Q) + .set_name("Seq_q") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto SEQ_LEN_KV_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEQ_LEN_KV) + .set_name("Seq_kv") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + scaled_dot_product_flash_attention_options.set_seq_len_q(SEQ_LEN_Q_) + .set_seq_len_kv(SEQ_LEN_KV_) + .set_padding_mask(true); + } + if (dropout_probability != 0.0f) { + auto seed = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEED) + .set_name("Seed") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) .set_data_type( - dropoutoffset.dtype() == kInt + dropoutseed.dtype() == kInt ? fe::DataType_t::INT32 : fe::DataType_t::INT64)); - auto scaled_dot_product_flash_attention_options = - fe::graph::SDPA_attributes() - .set_name("CUDNN_SDPA") - .set_is_inference(return_softmaxstats == false) - .set_causal_mask(is_causal) - .set_attn_scale(attn_scale) - .set_dropout(dropout_probability, seed, offset); - auto Q = mha_graph->tensor( - fe::graph::Tensor_attributes() - .set_name("Q") - .set_dim(q.sizes().vec()) - .set_stride(fixSizeOneDimStrideSDPA(q.sizes(), q.strides().vec()))); - auto K = mha_graph->tensor( - fe::graph::Tensor_attributes() - .set_name("K") - .set_dim(k.sizes().vec()) - .set_stride(fixSizeOneDimStrideSDPA(k.sizes(), k.strides().vec()))); - auto V = mha_graph->tensor( - fe::graph::Tensor_attributes() - .set_name("V") - .set_dim(v.sizes().vec()) - .set_stride(fixSizeOneDimStrideSDPA(v.sizes(), v.strides().vec()))); + auto offset = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(OFFSET) + .set_name("Offset") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type( + dropoutoffset.dtype() == kInt + ? 
fe::DataType_t::INT32 + : fe::DataType_t::INT64)); + scaled_dot_product_flash_attention_options.set_dropout( + dropout_probability, seed, offset); + } + auto Q_ = mha_graph->tensor( + fe::graph::Tensor_attributes().set_uid(Q).set_name("Q")); + auto K_ = mha_graph->tensor( + fe::graph::Tensor_attributes().set_uid(K).set_name("K")); + auto V_ = mha_graph->tensor( + fe::graph::Tensor_attributes().set_uid(V).set_name("V")); std::optional> bias; if (attn_bias.has_value()) { bias = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(BIAS) .set_name("bias") .set_dim(attn_bias.value().sizes().vec()) .set_stride(attn_bias.value().strides().vec())); scaled_dot_product_flash_attention_options.set_bias(bias.value()); } - auto [O, Stats] = - mha_graph->sdpa(Q, K, V, scaled_dot_product_flash_attention_options); - O->set_output(true).set_dim(o.sizes().vec()).set_stride(o.strides().vec()); - + auto [O_, Stats] = + mha_graph->sdpa(Q_, K_, V_, scaled_dot_product_flash_attention_options); + O_->set_uid(O).set_output(true); if (Stats) { - Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT); + Stats->set_uid(LSE) + .set_output(true) + .set_data_type(fe::DataType_t::FLOAT) + .set_stride(softmaxstats.strides().vec()); + } + if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { + auto RAG_Q_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_Q_OFF) + .set_name("cum_seq_q") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_K_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_K_OFF) + .set_name("cum_seq_k") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_V_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_V_OFF) + .set_name("cum_seq_v") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_O_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_O_OFF) + .set_name("cum_seq_o") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_STATS_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_LSE_OFF) + .set_name("cum_seq_stats") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + O_->set_ragged_offset(RAG_O_OFF_); + Q_->set_ragged_offset(RAG_Q_OFF_); + K_->set_ragged_offset(RAG_K_OFF_); + V_->set_ragged_offset(RAG_V_OFF_); + auto qsizevec = q.sizes().vec(); + auto ksizevec = k.sizes().vec(); + auto vsizevec = v.sizes().vec(); + auto osizevec = o.sizes().vec(); + qsizevec[2] = roundup_power2(qsizevec[2]); + ksizevec[2] = roundup_power2(ksizevec[2]); + vsizevec[2] = roundup_power2(vsizevec[2]); + osizevec[2] = roundup_power2(osizevec[2]); + // we checked for BSHD contig., set fake strides as cuDNN will complain + // if e.g., a ragged dim is smaller than a non-ragged one: + // consider HBSD tensor where H is 1 + Q_->set_dim(qsizevec).set_stride( + {INT_MAX, qsizevec[3], qsizevec[1] * qsizevec[3], 1}); + K_->set_dim(ksizevec).set_stride( + {INT_MAX, ksizevec[3], ksizevec[1] * ksizevec[3], 1}); + V_->set_dim(vsizevec).set_stride( + {INT_MAX, vsizevec[3], vsizevec[1] * vsizevec[3], 1}); + O_->set_dim(osizevec).set_stride( + {INT_MAX, osizevec[3], osizevec[1] * osizevec[3], 1}); + if (Stats) { + Stats->set_ragged_offset(RAG_STATS_OFF_); + auto statssizevec = softmaxstats.sizes().vec(); + statssizevec[2] = 
roundup_power2(statssizevec[2]); + Stats->set_dim(statssizevec); + } + } else { + Q_->set_dim(q.sizes().vec()) + .set_stride(fixSizeOneDimStrideSDPA(q.sizes(), q.strides().vec())); + K_->set_dim(k.sizes().vec()) + .set_stride(fixSizeOneDimStrideSDPA(k.sizes(), k.strides().vec())); + V_->set_dim(v.sizes().vec()) + .set_stride(fixSizeOneDimStrideSDPA(v.sizes(), v.strides().vec())); + O_->set_dim(o.sizes().vec()) + .set_stride(fixSizeOneDimStrideSDPA(o.sizes(), o.strides().vec())); + if (Stats) { + Stats->set_dim(softmaxstats.sizes().vec()); + } } AT_CUDNN_FRONTEND_CHECK(mha_graph->validate()); @@ -423,20 +638,10 @@ auto build_graph_and_tensors( AT_CUDNN_FRONTEND_CHECK(mha_graph->check_support(handle)); AT_CUDNN_FRONTEND_CHECK(mha_graph->build_plans(handle)); - return std::make_tuple( - std::move(mha_graph), - std::move(Q), - std::move(K), - std::move(V), - std::move(bias), - std::move(attn_scale), - std::move(seed), - std::move(offset), - std::move(O), - std::move(Stats)); + return mha_graph; } -auto build_graph_and_tensors_nestedtensor( +auto build_graph_nestedtensor( int64_t b, int64_t h_q, int64_t h_k, @@ -473,28 +678,22 @@ auto build_graph_and_tensors_nestedtensor( .set_compute_data_type(fe::DataType_t::FLOAT); auto attn_scale = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SCALE) .set_name("Attn_scale") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) .set_is_pass_by_value(true) .set_data_type(fe::DataType_t::FLOAT)); - auto seed = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Seed") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type(fe::DataType_t::INT32)); - auto offset = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Offset") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type(fe::DataType_t::INT32)); - auto SEQ_LEN_Q = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Seq_q") - .set_dim({b, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type(fe::DataType_t::INT32)); - auto SEQ_LEN_KV = + auto SEQ_LEN_Q_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEQ_LEN_Q) + .set_name("Seq_q") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto SEQ_LEN_KV_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEQ_LEN_KV) .set_name("Seq_kv") .set_dim({b, 1, 1, 1}) .set_stride({1, 1, 1, 1}) @@ -503,44 +702,69 @@ auto build_graph_and_tensors_nestedtensor( auto scaled_dot_product_flash_attention_options = fe::graph::SDPA_attributes() .set_name("CUDNN_SDPA_NESTEDTENSOR") - .set_is_inference(return_softmaxstats == false) + .set_generate_stats(return_softmaxstats) .set_causal_mask(is_causal) .set_attn_scale(attn_scale) - .set_dropout(dropout_probability, seed, offset) - .set_seq_len_q(SEQ_LEN_Q) - .set_seq_len_kv(SEQ_LEN_KV) + .set_seq_len_q(SEQ_LEN_Q_) + .set_seq_len_kv(SEQ_LEN_KV_) .set_padding_mask(true); + if (dropout_probability != 0.0f) { + auto seed = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEED) + .set_name("Seed") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type( + dropoutseed.dtype() == kInt + ? fe::DataType_t::INT32 + : fe::DataType_t::INT64)); + auto offset = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(OFFSET) + .set_name("Offset") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type( + dropoutoffset.dtype() == kInt + ? 
fe::DataType_t::INT32 + : fe::DataType_t::INT64)); + scaled_dot_product_flash_attention_options.set_dropout( + dropout_probability, seed, offset); + } // We hardcode BSHD to cuDNN even though the underlying layout is THD auto q_strides = q.strides(); auto k_strides = k.strides(); auto v_strides = v.strides(); + // NB: cuDNN API shape is transposed: we pass it nominally as HTD constexpr int strideidx0 = 1; constexpr int strideidx1 = 0; constexpr int strideidx2 = 2; - auto Q = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Q") - .set_dim({b, h_q, s_q, d_qk}) - .set_stride( - {INT_MAX, - q_strides[strideidx0], - q_strides[strideidx1], - q_strides[strideidx2]})); - auto K = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("K") - .set_dim({b, h_k, s_kv, d_qk}) - .set_stride( - {INT_MAX, - k_strides[strideidx0], - k_strides[strideidx1], - k_strides[strideidx2]})); - auto V = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("V") - .set_dim({b, h_v, s_kv, d_v}) - .set_stride( - {INT_MAX, - v_strides[strideidx0], - v_strides[strideidx1], - v_strides[strideidx2]})); + auto Q_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(Q) + .set_name("Q") + .set_dim({b, h_q, s_q, d_qk}) + .set_stride( + {INT_MAX, + q_strides[strideidx0], + q_strides[strideidx1], + q_strides[strideidx2]})); + auto K_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(K) + .set_name("K") + .set_dim({b, h_k, s_kv, d_qk}) + .set_stride( + {INT_MAX, + k_strides[strideidx0], + k_strides[strideidx1], + k_strides[strideidx2]})); + auto V_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(V) + .set_name("V") + .set_dim({b, h_v, s_kv, d_v}) + .set_stride( + {INT_MAX, + v_strides[strideidx0], + v_strides[strideidx1], + v_strides[strideidx2]})); std::optional> bias; if (attn_bias.has_value()) { TORCH_CHECK( @@ -548,44 +772,48 @@ auto build_graph_and_tensors_nestedtensor( "attn_bias not yet supportd with cuDNN Attention and NestedTensor"); bias = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(BIAS) .set_name("bias") .set_dim(attn_bias.value().sizes().vec()) .set_stride(attn_bias.value().strides().vec())); scaled_dot_product_flash_attention_options.set_bias(bias.value()); } - auto RAG_Q_OFF = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("cum_seq_q") - .set_dim({b + 1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type(fe::DataType_t::INT32)); - auto RAG_K_OFF = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("cum_seq_k") - .set_dim({b + 1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type(fe::DataType_t::INT32)); - auto RAG_V_OFF = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("cum_seq_v") - .set_dim({b + 1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type(fe::DataType_t::INT32)); - auto RAG_O_OFF = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("cum_seq_o") - .set_dim({b + 1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type(fe::DataType_t::INT32)); - // auto RAG_STATS_OFF = mha_graph->tensor(fe::graph::Tensor_attributes() - // .set_name("cum_seq_stats") - // .set_dim({b + 1, 1, 1, 1}) - // .set_stride({1, 1, 1, 1}) - // .set_data_type(fe::DataType_t::INT32)); - auto RAG_STATS_OFF = nullptr; - Q->set_ragged_offset(RAG_Q_OFF); - K->set_ragged_offset(RAG_K_OFF); - V->set_ragged_offset(RAG_V_OFF); - auto [O, Stats] = - mha_graph->sdpa(Q, K, V, scaled_dot_product_flash_attention_options); + auto RAG_Q_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_Q_OFF) + 
.set_name("cum_seq_q") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_K_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_K_OFF) + .set_name("cum_seq_k") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_V_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_V_OFF) + .set_name("cum_seq_v") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_O_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_O_OFF) + .set_name("cum_seq_o") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + Q_->set_ragged_offset(RAG_Q_OFF_); + K_->set_ragged_offset(RAG_K_OFF_); + V_->set_ragged_offset(RAG_V_OFF_); + auto [O_, Stats] = + mha_graph->sdpa(Q_, K_, V_, scaled_dot_product_flash_attention_options); auto o_strides = o.strides(); - O->set_output(true) + O_->set_output(true) + .set_uid(O) .set_dim({b, h_q, s_q, d_v}) .set_stride( {INT_MAX, @@ -593,16 +821,20 @@ auto build_graph_and_tensors_nestedtensor( o_strides[strideidx1], o_strides[strideidx2]}); - O->set_ragged_offset(RAG_O_OFF); + O_->set_ragged_offset(RAG_O_OFF_); if (Stats) { - TORCH_CHECK( - false, - "cuDNN SDPA Nested Tensor does not yet handle backwards/logsumexp computation"); - // TODO(eqy): fix when stats (backward) support is added + auto RAG_STATS_OFF = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_LSE_OFF) + .set_name("cum_seq_stats") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); Stats->set_output(true) + .set_uid(LSE) .set_data_type(fe::DataType_t::FLOAT) .set_dim({b, h_q, s_q, 1}) - .set_stride({h_q * s_q * d_v, d_v, s_q * d_v, 1}); + .set_stride({h_q * s_q, 1, h_q, 1}); Stats->set_ragged_offset(RAG_STATS_OFF); } AT_CUDNN_FRONTEND_CHECK(mha_graph->validate()); @@ -611,27 +843,10 @@ auto build_graph_and_tensors_nestedtensor( mha_graph->create_execution_plans({fe::HeurMode_t::A})); AT_CUDNN_FRONTEND_CHECK(mha_graph->check_support(handle)); AT_CUDNN_FRONTEND_CHECK(mha_graph->build_plans(handle)); - return std::make_tuple( - std::move(mha_graph), - std::move(Q), - std::move(K), - std::move(V), - std::move(bias), - std::move(attn_scale), - std::move(seed), - std::move(offset), - std::move(O), - std::move(Stats), - std::move(RAG_Q_OFF), - std::move(RAG_K_OFF), - std::move(RAG_V_OFF), - std::move(RAG_O_OFF), - std::move(RAG_STATS_OFF), - std::move(SEQ_LEN_Q), - std::move(SEQ_LEN_KV)); + return mha_graph; } -auto build_graph_and_tensors_backward( +auto build_graph_backward( int64_t b, int64_t h, int64_t s_q, @@ -667,6 +882,7 @@ auto build_graph_and_tensors_backward( .set_compute_data_type(fe::DataType_t::FLOAT); auto attn_scale = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SCALE) .set_name("Attn_scale") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) @@ -676,87 +892,415 @@ auto build_graph_and_tensors_backward( .set_name("CUDNN_SDPA_BACKWARD") .set_causal_mask(is_causal) .set_attn_scale(attn_scale); - auto Q = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Q") - .set_dim(q.sizes().vec()) - .set_stride(q.strides().vec())); - auto K = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("K") - .set_dim(k.sizes().vec()) - .set_stride(k.strides().vec())); - auto V = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("V") - .set_dim(v.sizes().vec()) 
- .set_stride(v.strides().vec())); + if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { + auto SEQ_LEN_Q_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEQ_LEN_Q) + .set_name("Seq_q") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto SEQ_LEN_KV_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEQ_LEN_KV) + .set_name("Seq_kv") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + sdpa_backward_options.set_seq_len_q(SEQ_LEN_Q_) + .set_seq_len_kv(SEQ_LEN_KV_) + .set_padding_mask(true); + } + + auto Q_ = mha_graph->tensor( + fe::graph::Tensor_attributes().set_uid(Q).set_name("Q")); + auto K_ = mha_graph->tensor( + fe::graph::Tensor_attributes().set_uid(K).set_name("K")); + auto V_ = mha_graph->tensor( + fe::graph::Tensor_attributes().set_uid(V).set_name("V")); std::optional> bias; if (attn_bias.has_value()) { bias = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(BIAS) .set_name("bias") .set_dim(attn_bias.value().sizes().vec()) .set_stride(attn_bias.value().strides().vec())); sdpa_backward_options.set_bias(bias.value()); } - auto Seed = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Seed") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type( - dropoutseed.dtype() == kInt - ? fe::DataType_t::INT32 - : fe::DataType_t::INT64)); - - auto Offset = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Offset") + if (dropout_probability != 0.0f) { + auto seed = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEED) + .set_name("Seed") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) .set_data_type( - dropoutoffset.dtype() == kInt + dropoutseed.dtype() == kInt ? fe::DataType_t::INT32 : fe::DataType_t::INT64)); - - auto O = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("O") - .set_dim(o.sizes().vec()) - .set_stride(o.strides().vec())); - auto STATS = mha_graph->tensor(fe::graph::Tensor_attributes() + auto offset = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(OFFSET) + .set_name("Offset") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type( + dropoutoffset.dtype() == kInt + ? 
fe::DataType_t::INT32 + : fe::DataType_t::INT64)); + sdpa_backward_options.set_dropout(dropout_probability, seed, offset); + } + auto O_ = mha_graph->tensor( + fe::graph::Tensor_attributes().set_uid(O).set_name("O")); + auto Stats = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(LSE) .set_name("Stats") - .set_dim(softmaxstats.sizes().vec()) .set_stride(softmaxstats.strides().vec()) .set_data_type(fe::DataType_t::FLOAT)); - auto DO = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("DO") - .set_dim(dO.sizes().vec()) - .set_stride(dO.strides().vec())); + auto Do = mha_graph->tensor( + fe::graph::Tensor_attributes().set_uid(DO).set_name("DO")); + auto [Dq, Dk, Dv] = mha_graph->sdpa_backward( + Q_, K_, V_, O_, Do, Stats, sdpa_backward_options); + Dq->set_uid(DQ).set_output(true); + Dk->set_uid(DK).set_output(true); + Dv->set_uid(DV).set_output(true); + if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { + auto RAG_Q_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_Q_OFF) + .set_name("cum_seq_q") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_K_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_K_OFF) + .set_name("cum_seq_k") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_V_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_V_OFF) + .set_name("cum_seq_v") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_O_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_O_OFF) + .set_name("cum_seq_o") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_STATS_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_LSE_OFF) + .set_name("cum_seq_stats") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + O_->set_ragged_offset(RAG_O_OFF_); + Q_->set_ragged_offset(RAG_Q_OFF_); + K_->set_ragged_offset(RAG_K_OFF_); + V_->set_ragged_offset(RAG_V_OFF_); + Dq->set_ragged_offset(RAG_Q_OFF_); + Dk->set_ragged_offset(RAG_K_OFF_); + Dv->set_ragged_offset(RAG_V_OFF_); + Do->set_ragged_offset(RAG_O_OFF_); + auto qsizevec = q.sizes().vec(); + auto ksizevec = k.sizes().vec(); + auto vsizevec = v.sizes().vec(); + auto osizevec = o.sizes().vec(); + qsizevec[2] = roundup_power2(qsizevec[2]); + ksizevec[2] = roundup_power2(ksizevec[2]); + vsizevec[2] = roundup_power2(vsizevec[2]); + osizevec[2] = roundup_power2(osizevec[2]); + // see corresponding section in the forward about the hardcoding + // of strides here + Q_->set_dim(qsizevec).set_stride( + {INT_MAX, qsizevec[3], qsizevec[1] * qsizevec[3], 1}); + K_->set_dim(ksizevec).set_stride( + {INT_MAX, ksizevec[3], ksizevec[1] * ksizevec[3], 1}); + V_->set_dim(vsizevec).set_stride( + {INT_MAX, vsizevec[3], vsizevec[1] * vsizevec[3], 1}); + O_->set_dim(osizevec).set_stride( + {INT_MAX, osizevec[3], osizevec[1] * osizevec[3], 1}); + // should be identical to their non-d counterparts + Dq->set_dim(qsizevec).set_stride( + {INT_MAX, qsizevec[3], qsizevec[1] * qsizevec[3], 1}); + Dk->set_dim(ksizevec).set_stride( + {INT_MAX, ksizevec[3], ksizevec[1] * ksizevec[3], 1}); + Dv->set_dim(vsizevec).set_stride( + {INT_MAX, vsizevec[3], vsizevec[1] * vsizevec[3], 1}); + Do->set_dim(osizevec).set_stride( + {INT_MAX, osizevec[3], osizevec[1] * osizevec[3], 1}); + + 
Stats->set_ragged_offset(RAG_STATS_OFF_); + auto statssizevec = softmaxstats.sizes().vec(); + statssizevec[2] = roundup_power2(statssizevec[2]); + Stats->set_dim(statssizevec); + } else { + O_->set_dim(o.sizes().vec()).set_stride(o.strides().vec()); + Q_->set_dim(q.sizes().vec()).set_stride(q.strides().vec()); + K_->set_dim(k.sizes().vec()).set_stride(k.strides().vec()); + V_->set_dim(v.sizes().vec()).set_stride(v.strides().vec()); + Dq->set_dim(dQ.sizes().vec()).set_stride(dQ.strides().vec()); + Dk->set_dim(dK.sizes().vec()).set_stride(dK.strides().vec()); + Dv->set_dim(dV.sizes().vec()).set_stride(dV.strides().vec()); + Do->set_dim(dO.sizes().vec()).set_stride(dO.strides().vec()); + Stats->set_dim(softmaxstats.sizes().vec()); + } + + AT_CUDNN_FRONTEND_CHECK(mha_graph->validate()); + AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle)); + AT_CUDNN_FRONTEND_CHECK( + mha_graph->create_execution_plans({fe::HeurMode_t::A})); + AT_CUDNN_FRONTEND_CHECK(mha_graph->check_support(handle)); + AT_CUDNN_FRONTEND_CHECK(mha_graph->build_plans(handle)); + return mha_graph; +} + +auto build_graph_backward_nestedtensor( + int64_t b, + int64_t h_q, + int64_t h_k, + int64_t h_v, + int64_t s_q, + int64_t s_kv, + int64_t d_qk, + int64_t d_v, + float scaling_factor, + bool is_causal, + float dropout_probability, + const Tensor& cum_seqlen_q, + const Tensor& cum_seqlen_kv, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const std::optional& attn_bias, + const Tensor& o, + const Tensor& dO, + const Tensor& softmaxstats, + Tensor& dQ, + Tensor& dK, + Tensor& dV, + const Tensor& dropoutseed, + const Tensor& dropoutoffset, + cudnnHandle_t& handle) { + auto dtype = fe::DataType_t::HALF; + if (q.scalar_type() == kBFloat16) { + dtype = fe::DataType_t::BFLOAT16; + } + auto mha_graph = std::make_shared(); + // We're baking in float accumulation and scale types + // in theory the graph may support other types, but they + // have not been tested + mha_graph->set_io_data_type(dtype) + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); + auto attn_scale = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SCALE) + .set_name("Attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)); + + auto SEQ_LEN_Q_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEQ_LEN_Q) + .set_name("Seq_q") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto SEQ_LEN_KV_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEQ_LEN_KV) + .set_name("Seq_kv") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto sdpa_backward_options = fe::graph::SDPA_backward_attributes() + .set_name("CUDNN_SDPA_NESTEDTENSOR_BACKWARD") + .set_causal_mask(is_causal) + .set_attn_scale(attn_scale) + .set_seq_len_q(SEQ_LEN_Q_) + .set_seq_len_kv(SEQ_LEN_KV_) + .set_padding_mask(true); if (dropout_probability != 0.0f) { - sdpa_backward_options.set_dropout(dropout_probability, Seed, Offset); + auto seed = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEED) + .set_name("Seed") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type( + dropoutseed.dtype() == kInt + ? 
fe::DataType_t::INT32 + : fe::DataType_t::INT64)); + auto offset = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(OFFSET) + .set_name("Offset") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type( + dropoutoffset.dtype() == kInt + ? fe::DataType_t::INT32 + : fe::DataType_t::INT64)); + sdpa_backward_options.set_dropout(dropout_probability, seed, offset); } - auto [DQ, DK, DV] = - mha_graph->sdpa_backward(Q, K, V, O, DO, STATS, sdpa_backward_options); - DQ->set_output(true).set_dim(dQ.sizes().vec()).set_stride(dQ.strides().vec()); - DK->set_output(true).set_dim(dK.sizes().vec()).set_stride(dK.strides().vec()); - DV->set_output(true).set_dim(dV.sizes().vec()).set_stride(dV.strides().vec()); + auto q_strides = q.strides(); + auto k_strides = k.strides(); + auto v_strides = v.strides(); + // NB: cuDNN API shape is transposed + constexpr int strideidx0 = 1; + constexpr int strideidx1 = 0; + constexpr int strideidx2 = 2; + auto Q_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(Q) + .set_name("Q") + .set_dim({b, h_q, s_q, d_qk}) + .set_stride( + {INT_MAX, + q_strides[strideidx0], + q_strides[strideidx1], + q_strides[strideidx2]})); + auto K_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(K) + .set_name("K") + .set_dim({b, h_k, s_kv, d_qk}) + .set_stride( + {INT_MAX, + k_strides[strideidx0], + k_strides[strideidx1], + k_strides[strideidx2]})); + auto V_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(V) + .set_name("V") + .set_dim({b, h_v, s_kv, d_v}) + .set_stride( + {INT_MAX, + v_strides[strideidx0], + v_strides[strideidx1], + v_strides[strideidx2]})); + auto o_strides = o.strides(); + auto O_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(O) + .set_name("O") + .set_dim({b, h_q, s_q, d_v}) + .set_stride( + {INT_MAX, + o_strides[strideidx0], + o_strides[strideidx1], + o_strides[strideidx2]})); + + std::optional> bias; + if (attn_bias.has_value()) { + TORCH_CHECK( + false, + "attn_bias not yet supportd with cuDNN Attention and NestedTensor"); + bias = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(BIAS) + .set_name("bias") + .set_dim(attn_bias.value().sizes().vec()) + .set_stride(attn_bias.value().strides().vec())); + sdpa_backward_options.set_bias(bias.value()); + } + auto RAG_Q_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_Q_OFF) + .set_name("cum_seq_q") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_K_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_K_OFF) + .set_name("cum_seq_k") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_V_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_V_OFF) + .set_name("cum_seq_v") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_O_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_O_OFF) + .set_name("cum_seq_o") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_STATS_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_LSE_OFF) + .set_name("cum_seq_stats") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + O_->set_ragged_offset(RAG_O_OFF_); + Q_->set_ragged_offset(RAG_Q_OFF_); + K_->set_ragged_offset(RAG_K_OFF_); + V_->set_ragged_offset(RAG_V_OFF_); + auto 
STATS = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(LSE) + .set_name("stats") + .set_dim({b, h_q, s_q, 1}) + .set_stride({s_q * h_q, 1, h_q, 1}) + .set_data_type(fe::DataType_t::FLOAT)); + STATS->set_ragged_offset(RAG_STATS_OFF_); + auto do_strides = dO.strides(); + auto DO_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_ragged_offset(RAG_O_OFF_) + .set_uid(DO) + .set_name("DO") + .set_dim({b, h_q, s_q, d_v}) + .set_stride( + {INT_MAX, + do_strides[strideidx0], + do_strides[strideidx1], + do_strides[strideidx2]})); + auto [Dq, Dk, Dv] = mha_graph->sdpa_backward( + Q_, K_, V_, O_, DO_, STATS, sdpa_backward_options); + Dq->set_output(true) + .set_uid(DQ) + .set_ragged_offset(RAG_Q_OFF_) + .set_dim({b, h_q, s_q, d_qk}) + .set_stride( + {INT_MAX, + q_strides[strideidx0], + q_strides[strideidx1], + q_strides[strideidx2]}); + Dk->set_output(true) + .set_uid(DK) + .set_ragged_offset(RAG_K_OFF_) + .set_dim({b, h_k, s_kv, d_qk}) + .set_stride( + {INT_MAX, + k_strides[strideidx0], + k_strides[strideidx1], + k_strides[strideidx2]}); + Dv->set_output(true) + .set_uid(DV) + .set_ragged_offset(RAG_V_OFF_) + .set_dim({b, h_v, s_kv, d_v}) + .set_stride( + {INT_MAX, + v_strides[strideidx0], + v_strides[strideidx1], + v_strides[strideidx2]}); + AT_CUDNN_FRONTEND_CHECK(mha_graph->validate()); AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle)); AT_CUDNN_FRONTEND_CHECK( mha_graph->create_execution_plans({fe::HeurMode_t::A})); AT_CUDNN_FRONTEND_CHECK(mha_graph->check_support(handle)); AT_CUDNN_FRONTEND_CHECK(mha_graph->build_plans(handle)); - return std::make_tuple( - std::move(mha_graph), - std::move(Q), - std::move(K), - std::move(V), - std::move(bias), - std::move(attn_scale), - std::move(Seed), - std::move(Offset), - std::move(O), - std::move(DO), - std::move(STATS), - std::move(DQ), - std::move(DK), - std::move(DV)); + return mha_graph; } void run_cudnn_SDP_fprop( @@ -778,31 +1322,61 @@ void run_cudnn_SDP_fprop( Tensor& o, Tensor& dropoutseed, Tensor& dropoutoffset) { - const auto dprops = at::cuda::getCurrentDeviceProperties(); - auto _dropoutseed = dropoutseed; - auto _dropoutoffset = dropoutoffset; - // cuDNN dropout bug requires these to be in int64 - if (dprops->major == 10 && dprops->minor == 0) { - _dropoutseed = dropoutseed.to(kLong); - _dropoutoffset = dropoutoffset.to(kLong); + // do nothing if we got 0-element tensors + if (!q.numel() || !k.numel() || !v.numel()) { + return; } + Tensor seqlen_q, seqlen_kv; + Tensor rag_off_q, rag_off_k, rag_off_v, rag_off_o, rag_off_lse; - cudnnHandle_t handle = getCudnnHandle(); if (!o.defined()) { // q is passed to us in BHSD dim order alloc_with_matching_layout(q, o, {b, h, s_q, d_v}); } - + bool use_ragged = use_ragged_in_dense(q, k, v, o, attn_bias.has_value()); if (return_softmaxstats && !softmaxstats.defined()) { - // TODO(eqy): verify that this is correct - softmaxstats = at::empty({b, h, s_q}, q.options().dtype(kFloat)); + // TODO(eqy): investigate why cuDNN doesn't like BSH layout softmaxstats + if (!use_ragged) { + softmaxstats = at::empty({b, h, s_q, 1}, q.options().dtype(kFloat)); + } else { + softmaxstats = + at::empty({b, s_q, h, 1}, q.options().dtype(kFloat)).transpose(1, 2); + } } - // do nothing if we got 0-element tensors - if (!q.numel() || !k.numel() || !v.numel()) { - return; + if (use_ragged) { + seqlen_q = at::full({b, 1, 1, 1}, s_q, q.options().dtype(kInt)); + seqlen_kv = at::full({b, 1, 1, 1}, s_kv, q.options().dtype(kInt)); + auto cum_seqlen_q = at::full({b + 1, 1, 1, 1}, s_q, 
q.options().dtype(kInt)) + .cumsum(0, kInt) + .add_(-s_q); + auto cum_seqlen_kv = + at::full({b + 1, 1, 1, 1}, s_kv, q.options().dtype(kInt)) + .cumsum(0, kInt) + .add_(-s_kv); + rag_off_q = cum_seqlen_q.mul(q.stride(-2)); + rag_off_k = cum_seqlen_kv.mul(k.stride(-2)); + rag_off_v = cum_seqlen_kv.mul(v.stride(-2)); + rag_off_o = cum_seqlen_q.mul(o.stride(-2)); + if (return_softmaxstats) { + rag_off_lse = cum_seqlen_q.mul(softmaxstats.stride(-2)); + } } + const auto dprops = at::cuda::getCurrentDeviceProperties(); + auto _dropoutseed = dropoutseed; + auto _dropoutoffset = dropoutoffset; + // cuDNN dropout bug requires these to be in int64 + if (dprops->major == 10 && dprops->minor == 0) { + _dropoutseed = dropoutseed.to(kLong); + _dropoutoffset = dropoutoffset.to(kLong); + } + + cudnnHandle_t handle = getCudnnHandle(); + + // NB: The key initialization will round up sequence length, stride data etc. + // if use_ragged_in_dense is enabled (to allow multiple sequence lenghths to + // reuse the same cached value/graph) auto key = MHACacheKeyWrapper( b, h, @@ -816,13 +1390,14 @@ void run_cudnn_SDP_fprop( attn_bias, dropout_probability, is_causal, - return_softmaxstats); - auto graph_and_tensors_ptr = mhagraphcache.find(key); - graph_and_tensors graph_and_tensors_values; - if (graph_and_tensors_ptr) { - graph_and_tensors_values = *graph_and_tensors_ptr; + return_softmaxstats, + false); + auto graph_ptr = getMHAGraphCache_().find(key); + std::shared_ptr mha_graph; + if (graph_ptr) { + mha_graph = *graph_ptr; } else { - graph_and_tensors_values = build_graph_and_tensors( + mha_graph = build_graph( b, h, s_q, @@ -843,29 +1418,39 @@ void run_cudnn_SDP_fprop( _dropoutoffset, handle); } - auto [mha_graph, Q, K, V, bias, attn_scale, seed, offset, O, Stats] = - graph_and_tensors_values; - std::unordered_map, void*> - variant_pack = { - {Q, q.data_ptr()}, - {K, k.data_ptr()}, - {V, v.data_ptr()}, - {attn_scale, &scaling_factor}, - {seed, _dropoutseed.data_ptr()}, - {offset, _dropoutoffset.data_ptr()}, - {O, o.data_ptr()}}; + std::unordered_map variant_pack = { + {Q, q.data_ptr()}, + {K, k.data_ptr()}, + {V, v.data_ptr()}, + {SCALE, &scaling_factor}, + {O, o.data_ptr()}}; if (return_softmaxstats) { - variant_pack[Stats] = softmaxstats.data_ptr(); + variant_pack[LSE] = softmaxstats.data_ptr(); } if (attn_bias.has_value()) { - variant_pack[bias.value()] = attn_bias.value().data_ptr(); + variant_pack[BIAS] = attn_bias.value().data_ptr(); + } + if (dropout_probability != 0.0f) { + variant_pack[SEED] = _dropoutseed.data_ptr(); + variant_pack[OFFSET] = _dropoutoffset.data_ptr(); + } + if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { + variant_pack[SEQ_LEN_Q] = seqlen_q.data_ptr(); + variant_pack[SEQ_LEN_KV] = seqlen_kv.data_ptr(); + variant_pack[RAG_Q_OFF] = rag_off_q.data_ptr(); + variant_pack[RAG_K_OFF] = rag_off_k.data_ptr(); + variant_pack[RAG_V_OFF] = rag_off_v.data_ptr(); + variant_pack[RAG_O_OFF] = rag_off_o.data_ptr(); + if (return_softmaxstats) { + variant_pack[RAG_LSE_OFF] = rag_off_lse.data_ptr(); + } } auto workspace_size = mha_graph->get_workspace_size(); auto workspace_ptr = c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); TORCH_CHECK( mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good()); - mhagraphcache.update(key, graph_and_tensors_values); + getMHAGraphCache_().update(key, mha_graph); } void run_cudnn_SDP_fprop_nestedtensor( @@ -904,72 +1489,78 @@ void run_cudnn_SDP_fprop_nestedtensor( if (return_softmaxstats && !softmaxstats.defined()) { 
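// --- Editor's sketch (not part of the patch). The avoid-recompile path above keys
// --- the graph cache on power-of-two-rounded sequence lengths and feeds the dense
// --- case through the ragged-offset machinery using synthetic cumulative offsets
// --- (cum_seqlen * stride). This standalone sketch shows both ideas with plain
// --- std::vector stand-ins; the patch itself uses at::full/cumsum/mul on tensors,
// --- and roundup_power2 here mirrors the helper added earlier in this file.
#include <cstdint>
#include <vector>

namespace sketch {

// Round a non-negative dimension up to the next power of two (0 -> 1),
// the same bit-twiddling idea used for the cache key above.
int roundup_power2(int dim) {
  if (!dim) return 1;
  dim--;
  dim |= dim >> 1;
  dim |= dim >> 2;
  dim |= dim >> 4;
  dim |= dim >> 8;
  dim |= dim >> 16;
  return dim + 1;
}

// For a dense batch where every sequence has the same length `s`, the ragged
// offsets degenerate to 0, s*stride, 2*s*stride, ..., b*s*stride (in elements).
std::vector<int64_t> dense_ragged_offsets(int64_t b, int64_t s, int64_t row_stride) {
  std::vector<int64_t> offsets(b + 1);
  for (int64_t i = 0; i <= b; ++i) {
    offsets[i] = i * s * row_stride;  // element offset of batch i's first row
  }
  return offsets;
}

} // namespace sketch

// Example: roundup_power2(384) == 512, so runs whose max sequence length falls in
// (256, 512] can hash to the same cached graph when TORCH_CUDNN_SDPA_AVOID_RECOMPILE
// is enabled, assuming the other cache-key fields also match.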
softmaxstats = at::empty({q.size(0), h_q, 1}, q.options().dtype(kFloat)); } - auto - [mha_graph, - Q, - K, - V, - bias, - attn_scale, - seed, - offset, - O, - Stats, - RAG_Q_OFF, - RAG_K_OFF, - RAG_V_OFF, - RAG_O_OFF, - RAG_STATS_OFF, - SEQ_LEN_Q, - SEQ_LEN_KV] = - build_graph_and_tensors_nestedtensor( - b, - h_q, - h_k, - h_v, - s_q, - s_kv, - d_qk, - d_v, - scaling_factor, - return_softmaxstats, - is_causal, - dropout_probability, - cum_seqlen_q, - cum_seqlen_kv, - q, - k, - v, - attn_bias, - softmaxstats, - o, - dropoutseed, - dropoutoffset, - handle); + + auto key = MHACacheKeyWrapper( + b, + h_q, + s_q, // max-seqlen-q + s_kv, // max-seqlen-kv + d_qk, + d_v, + q, + k, + v, + attn_bias, + dropout_probability, + is_causal, + return_softmaxstats, + true); + auto graph_ptr = getMHAGraphCache_().find(key); + std::shared_ptr mha_graph; + + if (graph_ptr) { + mha_graph = *graph_ptr; + } else { + mha_graph = build_graph_nestedtensor( + b, + h_q, + h_k, + h_v, + s_q, + s_kv, + d_qk, + d_v, + scaling_factor, + return_softmaxstats, + is_causal, + dropout_probability, + cum_seqlen_q, + cum_seqlen_kv, + q, + k, + v, + attn_bias, + softmaxstats, + o, + dropoutseed, + dropoutoffset, + handle); + } auto seqlen_q = at::diff(cum_seqlen_q, 1, 0); auto seqlen_kv = at::diff(cum_seqlen_kv, 1, 0); auto rag_q_off = cum_seqlen_q.mul(h_q * d_qk); - auto rag_k_off = cum_seqlen_kv.mul(h_k * d_qk); + auto rag_k_off = cum_seqlen_kv.mul(h_k * d_v); auto rag_v_off = cum_seqlen_kv.mul(h_v * d_v); auto rag_stats_off = cum_seqlen_q.mul(h_q); - std::unordered_map, void*> - variant_pack = { - {Q, q.data_ptr()}, - {K, k.data_ptr()}, - {V, v.data_ptr()}, - {attn_scale, &scaling_factor}, - {seed, dropoutseed.data_ptr()}, - {offset, dropoutoffset.data_ptr()}, - {O, o.data_ptr()}, - {RAG_Q_OFF, rag_q_off.data_ptr()}, - {RAG_O_OFF, rag_q_off.data_ptr()}, - {RAG_K_OFF, rag_k_off.data_ptr()}, - {RAG_V_OFF, rag_v_off.data_ptr()}, - {SEQ_LEN_Q, seqlen_q.data_ptr()}, - {SEQ_LEN_KV, seqlen_kv.data_ptr()}}; + std::unordered_map variant_pack = { + {Q, q.data_ptr()}, + {K, k.data_ptr()}, + {V, v.data_ptr()}, + {SCALE, &scaling_factor}, + {O, o.data_ptr()}, + {RAG_Q_OFF, rag_q_off.data_ptr()}, + {RAG_O_OFF, rag_q_off.data_ptr()}, + {RAG_K_OFF, rag_k_off.data_ptr()}, + {RAG_V_OFF, rag_v_off.data_ptr()}, + {SEQ_LEN_Q, seqlen_q.data_ptr()}, + {SEQ_LEN_KV, seqlen_kv.data_ptr()}}; if (return_softmaxstats) { - variant_pack[Stats] = softmaxstats.data_ptr(); - variant_pack[RAG_STATS_OFF] = cum_seqlen_q.data_ptr(); + variant_pack[LSE] = softmaxstats.data_ptr(); + variant_pack[RAG_LSE_OFF] = rag_stats_off.data_ptr(); + } + if (dropout_probability != 0.0f) { + variant_pack[SEED] = dropoutseed.data_ptr(); + variant_pack[OFFSET] = dropoutoffset.data_ptr(); } if (attn_bias.has_value()) { TORCH_CHECK("bias not supported with nestedtensor"); @@ -1008,6 +1599,9 @@ void run_cudnn_SDP_bprop( !softmaxstats.numel()) { return; } + Tensor seqlen_q, seqlen_kv; + Tensor rag_off_q, rag_off_k, rag_off_v, rag_off_o, rag_off_lse; + auto dprops = at::cuda::getCurrentDeviceProperties(); auto _dropoutseed = dropoutseed; auto _dropoutoffset = dropoutoffset; @@ -1034,10 +1628,28 @@ void run_cudnn_SDP_bprop( "with matching strides..."); #else const auto innermost_dO_stride = dO.strides()[dO.strides().size() - 1]; - if (innermost_dO_stride != 1) { + if (innermost_dO_stride != 1 || + use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { permute_to_matching_layout(o, dO_); } #endif + if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { + seqlen_q = at::full({b, 
1, 1, 1}, s_q, q.options().dtype(kInt)); + seqlen_kv = at::full({b, 1, 1, 1}, s_kv, q.options().dtype(kInt)); + auto cum_seqlen_q = at::full({b + 1, 1, 1, 1}, s_q, q.options().dtype(kInt)) + .cumsum(0, kInt) + .add_(-s_q); + auto cum_seqlen_kv = + at::full({b + 1, 1, 1, 1}, s_kv, q.options().dtype(kInt)) + .cumsum(0, kInt) + .add_(-s_kv); + rag_off_q = cum_seqlen_q.mul(q.stride(-2)); + rag_off_k = cum_seqlen_kv.mul(k.stride(-2)); + rag_off_v = cum_seqlen_kv.mul(v.stride(-2)); + rag_off_o = cum_seqlen_q.mul(o.stride(-2)); + rag_off_lse = cum_seqlen_q.mul(softmaxstats.stride(-2)); + } + cudnnHandle_t handle = getCudnnHandle(); auto key = MHACacheKeyWrapper( b, @@ -1052,13 +1664,14 @@ void run_cudnn_SDP_bprop( attn_bias, dropout_probability, is_causal, - true); - auto graph_and_tensors_backward_ptr = mhagraphbackwardcache.find(key); - graph_and_tensors_backward graph_and_tensors_backward_values; - if (graph_and_tensors_backward_ptr) { - graph_and_tensors_backward_values = *graph_and_tensors_backward_ptr; + true, + false); + auto graph_backward_ptr = getMHAGraphBackwardCache_().find(key); + std::shared_ptr mha_graph; + if (graph_backward_ptr) { + mha_graph = *graph_backward_ptr; } else { - graph_and_tensors_backward_values = build_graph_and_tensors_backward( + mha_graph = build_graph_backward( b, h, s_q, @@ -1082,49 +1695,185 @@ void run_cudnn_SDP_bprop( _dropoutoffset, handle); } - auto - [mha_graph, - Q, - K, - V, - bias, - attn_scale, - Seed, - Offset, - O, - Do, - Stats, - Dq, - Dk, - Dv] = graph_and_tensors_backward_values; - std::unordered_map, void*> - variant_pack = {// inputs - {Q, q.data_ptr()}, - {K, k.data_ptr()}, - {V, v.data_ptr()}, - {O, o.data_ptr()}, - {Do, dO_.data_ptr()}, - {Stats, softmaxstats.data_ptr()}, - // outputs - {Dq, dQ.data_ptr()}, - {Dk, dK.data_ptr()}, - {Dv, dV.data_ptr()}, - // pass by value - {attn_scale, &scaling_factor}}; + std::unordered_map variant_pack = { + // inputs + {Q, q.data_ptr()}, + {K, k.data_ptr()}, + {V, v.data_ptr()}, + {O, o.data_ptr()}, + {DO, dO_.data_ptr()}, + {LSE, softmaxstats.data_ptr()}, + // outputs + {DQ, dQ.data_ptr()}, + {DK, dK.data_ptr()}, + {DV, dV.data_ptr()}, + {SCALE, &scaling_factor}}; if (dropout_probability != 0.0f) { - variant_pack[Seed] = _dropoutseed.data_ptr(); - variant_pack[Offset] = _dropoutoffset.data_ptr(); + variant_pack[SEED] = _dropoutseed.data_ptr(); + variant_pack[OFFSET] = _dropoutoffset.data_ptr(); } if (attn_bias.has_value()) { - variant_pack[bias.value()] = attn_bias.value().data_ptr(); + variant_pack[BIAS] = attn_bias.value().data_ptr(); + } + if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { + variant_pack[SEQ_LEN_Q] = seqlen_q.data_ptr(); + variant_pack[SEQ_LEN_KV] = seqlen_kv.data_ptr(); + variant_pack[RAG_Q_OFF] = rag_off_q.data_ptr(); + variant_pack[RAG_K_OFF] = rag_off_k.data_ptr(); + variant_pack[RAG_V_OFF] = rag_off_v.data_ptr(); + variant_pack[RAG_O_OFF] = rag_off_o.data_ptr(); + variant_pack[RAG_LSE_OFF] = rag_off_lse.data_ptr(); + } + + auto workspace_size = mha_graph->get_workspace_size(); + auto workspace_ptr = + c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); + TORCH_CHECK(!workspace_size || workspace_ptr.get()); + TORCH_CHECK( + mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good()); + getMHAGraphBackwardCache_().update(key, mha_graph); +} + +void run_cudnn_SDP_bprop_nestedtensor( + int64_t b, + int64_t h_q, + int64_t h_k, + int64_t h_v, + int64_t s_q, + int64_t s_kv, + int64_t d_qk, + int64_t d_v, + float scaling_factor, + bool is_causal, + 
float dropout_probability, + const Tensor& cum_seqlen_q, + const Tensor& cum_seqlen_kv, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const std::optional& attn_bias, + const Tensor& o, + const Tensor& dO, + const Tensor& softmaxstats, + Tensor& dQ, + Tensor& dK, + Tensor& dV, + const Tensor& dropoutseed, + const Tensor& dropoutoffset) { + // do nothing if we got 0-element tensors + if (!q.numel() || !k.numel() || !v.numel() || !o.numel() || !dO.numel() || + !softmaxstats.numel()) { + return; } + + Tensor dO_ = dO; + const auto innermost_dO_stride = dO.strides()[dO.strides().size() - 1]; + if (innermost_dO_stride != 1) { + permute_to_matching_layout(o, dO_); + } + + auto seqlen_q = at::diff(cum_seqlen_q, 1, 0); + auto seqlen_kv = at::diff(cum_seqlen_kv, 1, 0); + auto rag_q_off = cum_seqlen_q.mul(h_q * d_qk); + auto rag_k_off = cum_seqlen_kv.mul(h_k * d_v); + auto rag_v_off = cum_seqlen_kv.mul(h_v * d_v); + auto rag_stats_off = cum_seqlen_q.mul(h_q); + + auto dprops = at::cuda::getCurrentDeviceProperties(); + auto _dropoutseed = dropoutseed; + auto _dropoutoffset = dropoutoffset; + // cuDNN dropout bug requires these to be in int64 + if (dprops->major == 10 && dprops->minor == 0) { + _dropoutseed = dropoutseed.to(kLong); + _dropoutoffset = dropoutoffset.to(kLong); + } + + cudnnHandle_t handle = getCudnnHandle(); + + auto key = MHACacheKeyWrapper( + b, + h_q, + s_q, // max-seqlen-q + s_kv, // max-seqlen-kv + d_qk, + d_v, + q, + k, + v, + attn_bias, + dropout_probability, + is_causal, + true, + true); + auto graph_ptr = getMHAGraphCache_().find(key); + std::shared_ptr mha_graph; + + if (graph_ptr) { + mha_graph = *graph_ptr; + } else { + mha_graph = build_graph_backward_nestedtensor( + b, + h_q, + h_k, + h_v, + s_q, + s_kv, + d_qk, + d_v, + scaling_factor, + is_causal, + dropout_probability, + cum_seqlen_q, + cum_seqlen_kv, + q, + k, + v, + attn_bias, + o, + dO_, + softmaxstats, + dQ, + dK, + dV, + dropoutseed, + dropoutoffset, + handle); + } + + std::unordered_map variant_pack = { + // inputs + {Q, q.data_ptr()}, + {K, k.data_ptr()}, + {V, v.data_ptr()}, + {O, o.data_ptr()}, + {DO, dO_.data_ptr()}, + {LSE, softmaxstats.data_ptr()}, + // outputs + {DQ, dQ.data_ptr()}, + {DK, dK.data_ptr()}, + {DV, dV.data_ptr()}, + {SCALE, &scaling_factor}, + {RAG_Q_OFF, rag_q_off.data_ptr()}, + {RAG_O_OFF, rag_q_off.data_ptr()}, + {RAG_K_OFF, rag_k_off.data_ptr()}, + {RAG_V_OFF, rag_v_off.data_ptr()}, + {RAG_LSE_OFF, rag_stats_off.data_ptr()}, + {SEQ_LEN_Q, seqlen_q.data_ptr()}, + {SEQ_LEN_KV, seqlen_kv.data_ptr()}}; + if (dropout_probability != 0.0f) { + variant_pack[SEED] = _dropoutseed.data_ptr(); + variant_pack[OFFSET] = _dropoutoffset.data_ptr(); + } + TORCH_CHECK( + !attn_bias.has_value(), + "attn_bias not yet supportd with cuDNN Attention and NestedTensor"); + auto workspace_size = mha_graph->get_workspace_size(); auto workspace_ptr = c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); TORCH_CHECK(!workspace_size || workspace_ptr.get()); TORCH_CHECK( mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good()); - mhagraphbackwardcache.update(key, graph_and_tensors_backward_values); } } // namespace native diff --git a/aten/src/ATen/native/cudnn/MHA.h b/aten/src/ATen/native/cudnn/MHA.h index 045e8cf6dee9d..620abc1aa0a8e 100644 --- a/aten/src/ATen/native/cudnn/MHA.h +++ b/aten/src/ATen/native/cudnn/MHA.h @@ -70,4 +70,31 @@ void run_cudnn_SDP_bprop( const Tensor& dropoutseed, const Tensor& dropoutoffset); +void run_cudnn_SDP_bprop_nestedtensor( + int64_t b, + int64_t 
h_q, + int64_t h_k, + int64_t h_v, + int64_t s_q, + int64_t s_kv, + int64_t d_qk, + int64_t d_v, + float scaling_factor, + bool is_causal, + float dropout_probability, + const Tensor& cum_seqlen_q, + const Tensor& cum_seqlen_kv, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const std::optional& attn_bias, + const Tensor& o, + const Tensor& dO, + const Tensor& softmaxstats, + Tensor& dQ, + Tensor& dK, + Tensor& dV, + const Tensor& dropoutseed, + const Tensor& dropoutoffset); + } // namespace at::native diff --git a/aten/src/ATen/native/hip/ck_gemm.h b/aten/src/ATen/native/hip/ck_gemm.h index 176cbabd5e01c..0d42cad56fcda 100644 --- a/aten/src/ATen/native/hip/ck_gemm.h +++ b/aten/src/ATen/native/hip/ck_gemm.h @@ -10,6 +10,7 @@ inline void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype),"at::cuda::blas_gemm_internal_ck: not implemented"); } +#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) template <> void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(double)); template <> @@ -18,7 +19,7 @@ template <> void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(at::Half)); template <> void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); - +#endif } // namespace at::native diff --git a/aten/src/ATen/native/hip/ck_gemm_bfloat16.hip b/aten/src/ATen/native/hip/ck_gemm_bfloat16.hip index 79cb14be41031..7561cede386fb 100644 --- a/aten/src/ATen/native/hip/ck_gemm_bfloat16.hip +++ b/aten/src/ATen/native/hip/ck_gemm_bfloat16.hip @@ -1,6 +1,7 @@ #undef __HIP_NO_HALF_CONVERSIONS__ - #include + +#if defined(USE_ROCM_CK_GEMM) #include #include @@ -781,3 +782,4 @@ void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { } } // namespace at::native +#endif // USE_ROCM_CK_GEMM diff --git a/aten/src/ATen/native/hip/ck_gemm_float.hip b/aten/src/ATen/native/hip/ck_gemm_float.hip index b8301a47981c6..c4fea6088d3f0 100644 --- a/aten/src/ATen/native/hip/ck_gemm_float.hip +++ b/aten/src/ATen/native/hip/ck_gemm_float.hip @@ -1,6 +1,7 @@ #undef __HIP_NO_HALF_CONVERSIONS__ #include +#if defined(USE_ROCM_CK_GEMM) #include #include @@ -484,3 +485,4 @@ void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(double)) { } } // namespace at::native +#endif // USE_ROCM_CK_GEMM diff --git a/aten/src/ATen/native/hip/ck_gemm_half.hip b/aten/src/ATen/native/hip/ck_gemm_half.hip index 552f0de845418..ebe044c389721 100644 --- a/aten/src/ATen/native/hip/ck_gemm_half.hip +++ b/aten/src/ATen/native/hip/ck_gemm_half.hip @@ -1,6 +1,7 @@ #undef __HIP_NO_HALF_CONVERSIONS__ #include +#if defined(USE_ROCM_CK_GEMM) #include #include @@ -606,3 +607,4 @@ void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(at::Half)) { } } // namespace at::native +#endif // USE_ROCM_CK_GEMM diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 154118d9f2728..41226680c4b58 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #endif // TODO: Remove the condition on AT_ROCM_ENABLED entirely, @@ -145,13 +146,13 @@ at::Tensor miopen_convolution_relu( #include #include +#include #include #include #include #include -#include #include #include #include @@ -162,10 +163,13 @@ at::Tensor miopen_convolution_relu( namespace at { namespace native { -Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { - auto group_size = t.size(dim) / groups; - return t.narrow(dim, group_idx * group_size, group_size); -} +// See NOTE [ Convolution design ] in 
aten/src/ATen/native/cudnn/ConvShared.cpp + +// --------------------------------------------------------------------- +// +// Helper classes +// +// --------------------------------------------------------------------- // This POD struct is used to let us easily compute hashes of the // parameters @@ -174,6 +178,8 @@ struct ConvolutionParams miopenHandle_t handle; miopenDataType_t dataType; int input_size[2 + max_dim]; + uint8_t input_dim; + at::MemoryFormat memory_format; int input_stride[2 + max_dim]; int weight_size[2 + max_dim]; int padding[max_dim]; @@ -181,25 +187,29 @@ struct ConvolutionParams int dilation[max_dim]; int64_t groups; bool deterministic; - int device_id; //This is needed to distinguish between miopen handles of multiple gpus. + c10::DeviceIndex device_id; //This is needed to distinguish between miopen handles of multiple gpus. // NB: transposed purposely omitted: transposed just swaps // forward and backward, so you can reuse the benchmark entry, }; -// ConvolutionParams must be a POD because we read out its memory -// contenst as char* when hashing -static_assert(std::is_standard_layout_v, "ConvolutionParams not POD"); void setConvolutionParams( - ConvolutionParams* params, miopenHandle_t handle, - const at::Tensor& input, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool deterministic) { - + ConvolutionParams* params, + miopenHandle_t handle, + const at::Tensor& input, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool deterministic, + at::MemoryFormat memory_format) { miopenDataType_t dataType = getMiopenDataType(input); memset(params, 0, sizeof(ConvolutionParams)); params->dataType = dataType; params->handle = handle; // ASSERT(weight.dim() == input.dim()) + params->input_dim = input.dim(); + params->memory_format = memory_format; for (int i = 0; i != input.dim(); ++i) { params->input_size[i] = (int) input.size(i); params->input_stride[i] = (int) input.stride(i); @@ -214,9 +224,7 @@ void setConvolutionParams( } params->groups = groups; params->deterministic = deterministic; - int device_id; - HIP_CHECK(hipGetDevice(&device_id)); - params->device_id = device_id; + params->device_id = at::cuda::current_device(); } // Convenience struct for passing around descriptors and data @@ -239,31 +247,10 @@ struct ConvolutionArgs { // // --------------------------------------------------------------------- -// Hashing machinery for ConvolutionParams -struct ParamsHash { - std::size_t operator()(const ConvolutionParams& params) const { - auto ptr = reinterpret_cast(¶ms); - uint32_t value = 0x811C9DC5; - for (const auto i : c10::irange((int)sizeof(ConvolutionParams))) { - value ^= ptr[i]; - value *= 0x01000193; - } - return (size_t)value; - } -}; - -struct ParamsEqual { - bool operator()(const ConvolutionParams& a, const ConvolutionParams& b) const { - auto ptr1 = reinterpret_cast(&a); - auto ptr2 = reinterpret_cast(&b); - return memcmp(ptr1, ptr2, sizeof(ConvolutionParams)) == 0; - } -}; - template struct BenchmarkCache { std::mutex mutex; - std::unordered_map map; + std::unordered_map, ParamsEqual> map; bool find(const ConvolutionParams& params, T* results) { std::lock_guard guard(mutex); @@ -314,39 +301,39 @@ size_t getWorkspaceSize( const ConvolutionArgs& args, const miopenConvFwdAlgorithm_t) { size_t sz = 0; - miopenConvolutionForwardGetWorkSpaceSize( + MIOPEN_CHECK(miopenConvolutionForwardGetWorkSpaceSize( args.handle, args.wdesc.desc(), 
args.idesc.desc(), args.cdesc.desc(), args.odesc.desc(), - &sz); + &sz)); return sz; } size_t getWorkspaceSize( const ConvolutionArgs& args, const miopenConvBwdDataAlgorithm_t) { size_t sz = 0; - miopenConvolutionBackwardDataGetWorkSpaceSize( + MIOPEN_CHECK(miopenConvolutionBackwardDataGetWorkSpaceSize( args.handle, args.odesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.idesc.desc(), - &sz); + &sz)); return sz; } size_t getWorkspaceSize( const ConvolutionArgs& args, const miopenConvBwdWeightsAlgorithm_t) { size_t sz = 0; - miopenConvolutionBackwardWeightsGetWorkSpaceSize( + MIOPEN_CHECK(miopenConvolutionBackwardWeightsGetWorkSpaceSize( args.handle, args.odesc.desc(), args.idesc.desc(), args.cdesc.desc(), args.wdesc.desc(), - &sz); + &sz)); return sz; } @@ -649,6 +636,94 @@ Workspace chooseSolution(const ConvolutionArgs& args, uint64_t* solution_id) } } +// See NOTE [ raw_cudnn_convolution_forward_out ] in aten/src/ATen/native/cudnn/Conv_v7.cpp + +// --------------------------------------------------------------------- +// +// Splitting to 32bit +// +// --------------------------------------------------------------------- + +template +static inline void split_batch_dim_to_32bit_out( + const at::Tensor& output, + const at::Tensor& input, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise, + int64_t max_worksize, + func_t func_32bit) { + constexpr int64_t int_max = std::numeric_limits::max(); + const int64_t ni = input.numel(); + const int64_t no = output.numel(); + // Assume the shape of the tensor is (N, C, D1, D2, ...) + // if N * C * D1 * D2 * ... <= int_max, then no need to split at all + if (ni <= int_max && no <= int_max) { + func_32bit( + output, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); + return; + } + // else, if C * D1 * D2 * ... <= int_max, then we just need to split across + // the N dimension + // + // Here we use a simple heuristics to determine the size of each split + // We don't max out the 2^31 address space because this number is super + // large and very likely to get an OOM. + int64_t n = output.size(0); + int64_t max_inner_size = std::max(ni, no) / n; + int64_t split_size = std::max(max_worksize / max_inner_size, 1L); + int64_t num_splits = (n + split_size - 1) / split_size; + if (split_size * max_inner_size < int_max) { + for (const auto i : c10::irange(num_splits)) { + int64_t start = split_size * i; + int64_t split_size_ = std::min(split_size, n - start); + Tensor input_ = input.narrow(0, start, split_size_); + Tensor output_ = output.narrow(0, start, split_size_); + func_32bit( + output_, + input_, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); + } + return; + } + // If control flow reaches here, this means even splitting N is not enough, + // then things starts to become complicated: For example, for conv2d, there + // following questions needs to be considered. + // - Is the memory layout NCHW or NHWC ? + // - If the conv is NCHW -> NC'H'W', then should we + // - split only NC? + // - split only N'C'? + // - split both? + // - If the conv is NHWC, then we need to split across H, we need to be very + // careful about the boundary condition + // to make sure that the boundary is handled correctly. + // - If we decide to make these splits, is the memory contiguous? Do we need + // to copy the memory? 
Considering the complexity of this issue, it is better + // not to use cuDNN for this case + TORCH_INTERNAL_ASSERT(false, "This case should not be dispatched to cuDNN."); +} + // --------------------------------------------------------------------- // // Bias addition @@ -690,8 +765,47 @@ void miopen_convolution_add_bias_(CheckedFrom c, const TensorArg& output, const */ } -// see NOTE [ Convolution design ] in src/Aten/native/cudnn/Conv.cpp +Tensor miopen_convolution_backward_bias(const Tensor& grad_output_t) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }; + + // TODO: Workaround since MIOpen does not support NHWC bias + // See #64426 + std::vector discard_dims; + for( int i = 0; i < grad_output_t.dim(); i++ ) { + if(i != output_channels_dim ) { + discard_dims.push_back(i); + } + } + + Tensor outputBias = at::squeeze( at::sum(grad_output_t, discard_dims, true) ); + if( outputBias.dim() == 0 ) { + // always return a tensor of shape [_] + return outputBias.unsqueeze(0); + } + else { + return outputBias; + } + +/* MIOpen does not support NHWC bias. Activate once support is added. + auto grad_bias_t = at::empty( { grad_output->size(output_channels_dim) }, grad_output->options()); + + TensorArg grad_bias{ grad_bias_t, "result", 0 }; + + TensorDescriptor bdesc{grad_bias->expand({1, grad_bias->size(0)}), + static_cast(grad_output->dim())}; + TensorDescriptor odesc{*grad_output}; + + auto handle = getMiopenHandle(); + auto dataType = getMiopenDataType(*grad_bias); + Constant one(dataType, 1); + Constant zero(dataType, 0); + MIOPEN_CHECK(miopenConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(), + &zero, bdesc.desc(), grad_bias->data_ptr())); + return *grad_bias; +*/ +} // --------------------------------------------------------------------- // @@ -699,30 +813,47 @@ void miopen_convolution_add_bias_(CheckedFrom c, const TensorArg& output, const // // --------------------------------------------------------------------- -// The raw API directly invokes MIOpen. -// -// There are a few reasons this should never be directly exposed -// via ATen: -// -// - It takes output as a parameter (this should be computed!) -// - It doesn't do input checking -// - It doesn't resize output (it is assumed to be correctly sized) -// -void raw_miopen_convolution_forward_out( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - +void raw_miopen_convolution_forward_out_32bit( + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenConvolution; + miopenConvolutionMode_t c_mode = depthwise ? 
miopenDepthwise : miopenConvolution; - ConvolutionArgs args{ input, output, weight }; + ConvolutionArgs args{input, output, weight}; args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(weight, input.suggest_memory_format(), 0); - args.odesc.set(output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); + at::MemoryFormat memory_format = miopen_conv_suggest_memory_format(input, weight); + setConvolutionParams( + &args.params, + args.handle, + input, + weight, + padding, + stride, + dilation, + groups, + deterministic, + memory_format); + args.idesc.set(input, memory_format); + args.wdesc.set(weight, memory_format, 0); + args.odesc.set(output, memory_format); + args.cdesc.set( + dataType, + c_mode, + input.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + benchmark, + deterministic); if (at::globalContext().immediateMiopen()) { uint64_t solution_id; @@ -730,10 +861,16 @@ void raw_miopen_convolution_forward_out( MIOPEN_CHECK(miopenConvolutionForwardImmediate( args.handle, - args.wdesc.desc(), weight.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), + args.wdesc.desc(), + weight.const_data_ptr(), + args.idesc.desc(), + input.const_data_ptr(), args.cdesc.desc(), - args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size, solution_id)); + args.odesc.desc(), + output.data_ptr(), + workspace.data, + workspace.size, + solution_id)); } else { miopenConvFwdAlgorithm_t fwdAlg; @@ -744,472 +881,216 @@ void raw_miopen_convolution_forward_out( MIOPEN_CHECK(miopenConvolutionForward( args.handle, - &one, args.idesc.desc(), input.const_data_ptr(), - args.wdesc.desc(), weight.const_data_ptr(), - args.cdesc.desc(), fwdAlg, &zero, - args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size)); + &one, + args.idesc.desc(), + input.const_data_ptr(), + args.wdesc.desc(), + weight.const_data_ptr(), + args.cdesc.desc(), + fwdAlg, + &zero, + args.odesc.desc(), + output.data_ptr(), + workspace.data, + workspace.size)); } } -Tensor miopen_convolution_forward( +void raw_miopen_convolution_forward_out( + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + split_batch_dim_to_32bit_out( + output, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise, + 1024 * 1024 * 256, + raw_miopen_convolution_forward_out_32bit); +} + +void miopen_convolution_forward_out( + TensorArg& output, CheckedFrom c, - const TensorArg& input, const TensorArg& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ + const TensorArg& input, + const TensorArg& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { checkAllSameType(c, {input, weight}); checkAllSameGPU(c, {input, weight}); - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *weight)) { - memory_format = (weight->ndimension() == 5) ? 
at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - - Tensor output_t = at::detail::empty_cuda( - conv_output_size(input->sizes(), weight->sizes(), - padding, stride, dilation), - input->options().memory_format(memory_format)); - - if (output_t.numel() == 0) { - return output_t; - } - - // Avoid ambiguity of "output" when this is being used as backwards - TensorArg output{ output_t, "result", 0 }; - convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups); + auto memory_format = output->suggest_memory_format(); + convolution_shape_check( + c, input, weight, output, padding, stride, dilation, groups); - // See #4500 Tensor weight_contig = weight->contiguous(memory_format); - // Make sure that NC11 strides follow formula - weight_contig.resize_(weight_contig.sizes(), memory_format); Tensor input_contig = input->contiguous(memory_format); - input_contig.resize_(input_contig.sizes(), memory_format); - - raw_miopen_convolution_forward_out( - *output, input_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return *output; + *output, + input_contig, + weight_contig, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); } Tensor miopen_convolution( - const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) -{ + const Tensor& input_t, + const Tensor& weight_t, + const std::optional& bias_t_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); const Tensor& bias_t = *bias_t_maybe_owned; - TensorArg input { input_t, "input", 1 }, - weight { weight_t, "weight", 2 }, - bias { bias_t, "bias", 3 }; + TensorArg input{input_t, "input", 1 }, weight{weight_t, "weight", 2}, bias{bias_t, "bias", 3}; CheckedFrom c = "miopen_convolution"; - auto output_t = miopen_convolution_forward( - c, input, weight, padding, stride, dilation, groups, benchmark, deterministic); + auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input->options().memory_format(memory_format)); + if (output_t.numel() == 0) { + return output_t; + } + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, + c, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); if (bias->defined()) { - miopen_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + miopen_convolution_add_bias_(c, output, bias); } - return output_t; + return *output; } -//Depthwise Convolutions -void raw_miopen_depthwise_convolution_forward_out( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { +Tensor miopen_convolution_transpose_backward_input( + const Tensor& grad_output_t, + const Tensor& weight_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { + TensorArg grad_output{ grad_output_t, 
"grad_output", 1 }, weight{weight_t, "weight", 2}; + auto memory_format = + miopen_conv_suggest_memory_format(grad_output_t, weight_t); + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + grad_output_t.sizes(), weight_t.sizes(), padding, stride, dilation), + grad_output_t.options().memory_format(memory_format)); - auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenDepthwise; + if (output_t.numel() == 0) { + return output_t; + } + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, + "miopen_convolution_transpose_backward_input", + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); + return *output; +} - ConvolutionArgs args{ input, output, weight }; - args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(weight, input.suggest_memory_format(), 0); - args.odesc.set(output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); +// file organization would put miopen_convolution_transpose_backward_weight here, +// but it depends on miopen_convolution_backward_weight which is defined later +Tensor miopen_convolution_transpose_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic); - if (at::globalContext().immediateMiopen()) { - uint64_t solution_id; - Workspace workspace = chooseSolution(args, &solution_id); +std::tuple miopen_convolution_transpose_backward( + const at::Tensor& input, + const at::Tensor& grad_output_t, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + std::array output_mask) { + Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); - MIOPEN_CHECK(miopenConvolutionForwardImmediate( - args.handle, - args.wdesc.desc(), weight.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), - args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size, solution_id)); + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = miopen_convolution_transpose_backward_input( + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); } - else { - miopenConvFwdAlgorithm_t fwdAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &fwdAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionForward( - args.handle, - &one, args.idesc.desc(), input.const_data_ptr(), - args.wdesc.desc(), weight.const_data_ptr(), - args.cdesc.desc(), fwdAlg, &zero, - args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size)); + if (output_mask[1]) { + grad_weight = miopen_convolution_transpose_backward_weight( + weight.sizes(), + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); + } + if (output_mask[2]) { + grad_bias = miopen_convolution_backward_bias(grad_output); } -} - -Tensor miopen_depthwise_convolution_forward( - CheckedFrom c, - const TensorArg& input, const TensorArg& weight, - IntArrayRef padding, IntArrayRef 
stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - checkAllSameType(c, {input, weight}); - checkAllSameGPU(c, {input, weight}); - - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *weight)) { - memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - - Tensor output_t = at::detail::empty_cuda( - conv_output_size(input->sizes(), weight->sizes(), - padding, stride, dilation), - input->options().memory_format(memory_format)); - - TensorArg output{ output_t, "result", 0 }; - convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups); - - // See #4500 - Tensor weight_contig = weight->contiguous(memory_format); - // Make sure that NC11 strides follow formula - weight_contig.resize_(weight_contig.sizes(), memory_format); - Tensor input_contig = input->contiguous(memory_format); - input_contig.resize_(input_contig.sizes(), memory_format); - - raw_miopen_depthwise_convolution_forward_out( - *output, input_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return *output; -} - -Tensor miopen_depthwise_convolution( - const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) -{ - // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); - const Tensor& bias_t = *bias_t_maybe_owned; - - TensorArg input { input_t, "input", 1 }, - weight { weight_t, "weight", 2 }, - bias { bias_t, "bias", 3 }; - CheckedFrom c = "miopen_depthwise_convolution"; - auto output_t = miopen_depthwise_convolution_forward( - c, input, weight, padding, stride, dilation, groups, benchmark, deterministic); - if (bias->defined()) { - miopen_convolution_add_bias_(c, { output_t, "result", 0 }, bias); - } - return output_t; -} - -// --------------------------------------------------------------------- -// -// Convolution backward (bias) -// -// --------------------------------------------------------------------- - -Tensor miopen_convolution_backward_bias( - const Tensor& grad_output_t) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }; - - // TODO: Workaround since MIOpen does not support NHWC bias - // See #64426 - std::vector discard_dims; - for( int i = 0; i < grad_output_t.dim(); i++ ) { - if(i != output_channels_dim ) { - discard_dims.push_back(i); - } - } - - Tensor outputBias = at::squeeze( at::sum(grad_output_t, discard_dims, true) ); - if( outputBias.dim() == 0 ) { - // always return a tensor of shape [_] - return outputBias.unsqueeze(0); - } - else { - return outputBias; - } - -/* MIOpen does not support NHWC bias. Activate once support is added. 
- auto grad_bias_t = at::empty( { grad_output->size(output_channels_dim) }, grad_output->options()); - - TensorArg grad_bias{ grad_bias_t, "result", 0 }; - - TensorDescriptor bdesc{grad_bias->expand({1, grad_bias->size(0)}), - static_cast(grad_output->dim())}; - TensorDescriptor odesc{*grad_output}; - - auto handle = getMiopenHandle(); - auto dataType = getMiopenDataType(*grad_bias); - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(), - &zero, bdesc.desc(), grad_bias->data_ptr())); - return *grad_bias; -*/ -} - -// --------------------------------------------------------------------- -// -// Convolution backward (weight) -// -// --------------------------------------------------------------------- - -void raw_miopen_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - - auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenConvolution; - - ConvolutionArgs args{ input, grad_output, grad_weight }; - args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(grad_weight, input.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); - - if (at::globalContext().immediateMiopen()) { - uint64_t solution_id; - Workspace workspace = chooseSolution(args, &solution_id); - - MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate( - args.handle, - args.odesc.desc(), grad_output.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size, solution_id)); - } - else { - miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardWeights( - args.handle, - &one, args.odesc.desc(), grad_output.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), bwdFilterAlg, &zero, - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); - } -} - -//Depthwise backward weights. 
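For reference, the bias-gradient workaround kept in miopen_convolution_backward_bias (and deleted from its old location above) sums grad_output over every dimension except the channel dimension and always returns a 1-D tensor, because MIOpen has no NHWC bias kernel. A minimal standalone sketch of that reduction, assuming NCHW layout (output_channels_dim == 1); the helper name below is illustrative only:

#include <ATen/ATen.h>
#include <vector>

// Standalone sketch (not part of the patch): sum grad_output over every
// dimension except the channel dimension, then squeeze down to a 1-D [C]
// tensor, mirroring the NHWC bias workaround above.
at::Tensor bias_grad_sketch(const at::Tensor& grad_output) {
  std::vector<int64_t> discard_dims;
  for (int64_t i = 0; i < grad_output.dim(); ++i) {
    if (i != 1) {  // assume the channel dimension is dim 1 (NCHW)
      discard_dims.push_back(i);
    }
  }
  at::Tensor b = at::squeeze(at::sum(grad_output, discard_dims, /*keepdim=*/true));
  return b.dim() == 0 ? b.unsqueeze(0) : b;  // always return a tensor of shape [C]
}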
-void raw_miopen_depthwise_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - - auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenDepthwise; - - ConvolutionArgs args{ input, grad_output, grad_weight }; - args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(grad_weight, input.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); - - if (at::globalContext().immediateMiopen()) { - uint64_t solution_id; - Workspace workspace = chooseSolution(args, &solution_id); - - MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate( - args.handle, - args.odesc.desc(), grad_output.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size, solution_id)); - } - else { - miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardWeights( - args.handle, - &one, args.odesc.desc(), grad_output.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), bwdFilterAlg, &zero, - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); - } -} - -Tensor miopen_depthwise_convolution_backward_weight( - CheckedFrom c, - IntArrayRef weight_size, const TensorArg& grad_output, const TensorArg& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - - checkAllSameType(c, {grad_output, input}); - checkAllSameGPU(c, {grad_output, input}); - - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *grad_output)) { - memory_format = (input->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - - Tensor grad_output_contig_t = grad_output->contiguous(memory_format); - // Make sure that NC11 strides follow formula - grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format); - TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; - - Tensor input_contig_t = input->contiguous(memory_format); - input_contig_t.resize_(input_contig_t.sizes(), memory_format); - TensorArg input_contig{ input_contig_t, "input", 2}; - - auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format); - - // For uniformity with everything else, although it seems grad_weight - // would be unambiguous too. 
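For context on split_batch_dim_to_32bit_out added earlier in this file: it splits only along the batch dimension and sizes each chunk so the chunk's element count stays below 2^31, using max_worksize / max_inner_size as the per-chunk batch count. A standalone sketch of that arithmetic (names below are illustrative; only the formulas mirror the helper, and n > 0 is assumed):

#include <algorithm>
#include <cstdint>
#include <limits>

struct SplitPlan {
  int64_t split_size;   // samples per chunk
  int64_t num_splits;   // number of chunks
  bool fits_in_int32;   // whether each chunk stays below 2^31 elements
};

// ni / no: numel of the input / output tensors; n: batch size; max_worksize:
// target element budget per chunk (e.g. 1024 * 1024 * 256 for the forward path above).
SplitPlan plan_batch_splits(int64_t n, int64_t ni, int64_t no, int64_t max_worksize) {
  constexpr int64_t int_max = std::numeric_limits<int32_t>::max();
  const int64_t max_inner_size = std::max(ni, no) / n;  // elements per sample
  const int64_t split_size = std::max(max_worksize / max_inner_size, int64_t{1});
  const int64_t num_splits = (n + split_size - 1) / split_size;
  return {split_size, num_splits, split_size * max_inner_size < int_max};
}

For example, with a per-chunk budget of 256 * 1024 * 1024 elements and 1024 * 1024 elements per sample, each chunk covers at most 256 samples; if even a single sample exceeds 2^31 elements, no batch split helps, which is the case the helper above rejects with TORCH_INTERNAL_ASSERT.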
- TensorArg grad_weight{ grad_weight_t, "result", 0 }; - convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); - - raw_miopen_depthwise_convolution_backward_weight_out( - *grad_weight, *grad_output_contig, *input_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return grad_weight_t; -} - -Tensor miopen_depthwise_convolution_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 2 }; - return miopen_depthwise_convolution_backward_weight( - "miopen_depthwise_convolution_backward_weight", - weight_size, grad_output, input, - padding, stride, dilation, groups, benchmark, deterministic); -} - -Tensor miopen_convolution_backward_weight( - CheckedFrom c, - IntArrayRef weight_size, const TensorArg& grad_output, const TensorArg& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - - checkAllSameType(c, {grad_output, input}); - checkAllSameGPU(c, {grad_output, input}); - - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *grad_output)) { - memory_format = (input->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - - Tensor grad_output_contig_t = grad_output->contiguous(memory_format); - // Make sure that NC11 strides follow formula - grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format); - TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; - - Tensor input_contig_t = input->contiguous(memory_format); - input_contig_t.resize_(input_contig_t.sizes(), memory_format); - TensorArg input_contig{ input_contig_t, "input", 2}; - - auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format); - - // For uniformity with everything else, although it seems grad_weight - // would be unambiguous too. 
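A note on memory formats: the patch replaces the per-call-site ternaries visible in the removed code here (miopen_conv_use_channels_last plus an ndimension() == 5 check) with a single miopen_conv_suggest_memory_format call. The sketch below is a hypothetical restatement of what those ternaries selected, not the actual helper:

#include <ATen/ATen.h>

// Hypothetical restatement of the removed selection logic: contiguous unless
// channels-last is preferred, in which case the 4-D or 5-D channels-last
// variant is chosen from the tensor's dimensionality.
at::MemoryFormat suggest_format_sketch(const at::Tensor& weight,
                                       bool prefer_channels_last) {
  if (!prefer_channels_last) {
    return at::MemoryFormat::Contiguous;
  }
  return (weight.ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d
                                    : at::MemoryFormat::ChannelsLast;
}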
- TensorArg grad_weight{ grad_weight_t, "result", 0 }; - convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); - - raw_miopen_convolution_backward_weight_out( - *grad_weight, *grad_output_contig, *input_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return grad_weight_t; -} - -Tensor miopen_convolution_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 2 }; - return miopen_convolution_backward_weight( - "miopen_convolution_backward_weight", - weight_size, grad_output, input, - padding, stride, dilation, groups, benchmark, deterministic); -} - -Tensor miopen_convolution_transpose_backward_input( - const Tensor& grad_output_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) -{ - TensorArg grad_output { grad_output_t, "grad_output", 1 }, - weight { weight_t, "weight", 2 }; - return miopen_convolution_forward( - "miopen_convolution_transpose_backward_input", - grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); -} - -Tensor miopen_convolution_transpose_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 2 }; - return miopen_convolution_backward_weight( - "miopen_convolution_backward_weight", - weight_size, input, grad_output, - padding, stride, dilation, groups, benchmark, deterministic); -} - -std::tuple miopen_convolution_transpose_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, std::array output_mask) { - - Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); - - Tensor grad_input, grad_weight, grad_bias; - if (output_mask[0]) { - grad_input = miopen_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[1]) { - grad_weight = miopen_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[2]) { - grad_bias = miopen_convolution_backward_bias(grad_output); - } - - return std::tuple{grad_input, grad_weight, grad_bias}; + + return std::tuple{grad_input, grad_weight, grad_bias}; } // --------------------------------------------------------------------- @@ -1218,23 +1099,50 @@ std::tuple miopen_convolution_transpose_backwa // // --------------------------------------------------------------------- -void raw_miopen_convolution_backward_input_out( +// See NOTE [ Backward vs transpose convolutions ] in aten/src/ATen/native/cudnn/ConvShared.cpp + +void raw_miopen_convolution_backward_input_out_32bit( const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool 
deterministic) { - + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { auto dataType = getMiopenDataType(grad_output); - miopenConvolutionMode_t c_mode = miopenConvolution; + miopenConvolutionMode_t c_mode = depthwise ? miopenDepthwise : miopenConvolution; - ConvolutionArgs args{ grad_input, grad_output, weight }; + ConvolutionArgs args{grad_input, grad_output, weight}; args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, grad_input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(grad_input); - args.wdesc.set(weight, grad_output.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); + at::MemoryFormat memory_format = + miopen_conv_suggest_memory_format(grad_input, weight); + setConvolutionParams( + &args.params, + args.handle, + grad_input, + weight, + padding, + stride, + dilation, + groups, + deterministic, + memory_format); + args.idesc.set(grad_input, memory_format); + args.wdesc.set(weight, memory_format, 0); + args.odesc.set(grad_output, memory_format); + args.cdesc.set( + dataType, + c_mode, + grad_output.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + benchmark, + deterministic); if (at::globalContext().immediateMiopen()) { uint64_t solution_id; @@ -1245,7 +1153,10 @@ void raw_miopen_convolution_backward_input_out( args.odesc.desc(), grad_output.const_data_ptr(), args.wdesc.desc(), weight.const_data_ptr(), args.cdesc.desc(), - args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size, solution_id)); + args.idesc.desc(), grad_input.mutable_data_ptr(), + workspace.data, + workspace.size, + solution_id)); } else { miopenConvBwdDataAlgorithm_t bwdDataAlg; @@ -1256,216 +1167,521 @@ void raw_miopen_convolution_backward_input_out( MIOPEN_CHECK(miopenConvolutionBackwardData( args.handle, - &one, args.odesc.desc(), grad_output.const_data_ptr(), + &one, + args.odesc.desc(), grad_output.const_data_ptr(), args.wdesc.desc(), weight.const_data_ptr(), - args.cdesc.desc(), bwdDataAlg, &zero, - args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size)); + args.cdesc.desc(), + bwdDataAlg, + &zero, + args.idesc.desc(), grad_input.mutable_data_ptr(), + workspace.data, + workspace.size)); } } -// see NOTE [ Backward vs transpose convolutions ] in src/Aten/native/cudnn/Conv.cpp +void raw_miopen_convolution_backward_input_out( + const at::Tensor& grad_input, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + split_batch_dim_to_32bit_out( + grad_input, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise, + 1024 * 1024 * 128, + raw_miopen_convolution_backward_input_out_32bit); +} Tensor miopen_convolution_backward_input( CheckedFrom c, - IntArrayRef input_size, const TensorArg& grad_output, const TensorArg& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ + IntArrayRef input_size, + const TensorArg& grad_output, + const TensorArg& weight, + IntArrayRef padding, + IntArrayRef 
stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { checkAllSameType(c, {grad_output, weight}); checkAllSameGPU(c, {grad_output, weight}); - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*grad_output, *weight)) { - memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - + auto memory_format = miopen_conv_suggest_memory_format(*grad_output, *weight); Tensor grad_input_t = at::detail::empty_cuda( input_size, grad_output->options().memory_format(memory_format)); // Avoid "grad_input" when this is being used as transposed convolution - TensorArg grad_input{ grad_input_t, "result", 0 }; - convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); + TensorArg grad_input{grad_input_t, "result", 0}; + convolution_shape_check( + c, grad_input, weight, grad_output, padding, stride, dilation, groups); - // See #4500 Tensor weight_contig = weight->contiguous(memory_format); - // Make sure that NC11 strides follow formula - weight_contig.resize_(weight_contig.sizes(), memory_format); - Tensor grad_output_contig = grad_output->contiguous(memory_format); - grad_output_contig.resize_(grad_output_contig.sizes(), memory_format); raw_miopen_convolution_backward_input_out( - *grad_input, grad_output_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic); + *grad_input, + grad_output_contig, + weight_contig, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); return *grad_input; } -Tensor miopen_convolution_transpose_forward( - CheckedFrom c, - const TensorArg& grad_output, const TensorArg& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - auto input_size = conv_input_size(grad_output->sizes(), weight->sizes(), - padding, output_padding, stride, dilation, groups); - return miopen_convolution_backward_input(c, input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic); -} - +// overload Tensor miopen_convolution_backward_input( - IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - weight{ weight_t, "weight", 2 }; + IntArrayRef input_size, + const Tensor& grad_output_t, + const Tensor& weight_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + TensorArg grad_output{grad_output_t, "grad_output", 1}, + weight{weight_t, "weight", 2}; return miopen_convolution_backward_input( "miopen_convolution_backward_input", - input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic); + input_size, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); } -//Depthwise convolutions backward data. 
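miopen_convolution_backward_input above is also what miopen_convolution_transpose_forward later in this file lowers onto, with conv_input_size supplying the grad_input shape. A standalone sketch of the per-dimension shape arithmetic, assuming the standard PyTorch convolution shape formulas (illustrative helpers, not the ConvUtils implementation):

#include <cstdint>

// Forward convolution output size for one spatial dimension.
int64_t conv_out_dim(int64_t in, int64_t kernel, int64_t pad,
                     int64_t stride, int64_t dilation) {
  return (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}

// Inverse used when a transposed convolution's forward pass is lowered onto
// the data-backward of a regular convolution.
int64_t conv_in_dim(int64_t out, int64_t kernel, int64_t pad, int64_t stride,
                    int64_t dilation, int64_t output_padding) {
  return (out - 1) * stride - 2 * pad + dilation * (kernel - 1) + output_padding + 1;
}

For example, in = 56, kernel = 3, pad = 1, stride = 2, dilation = 1 gives out = 28, and recovering 56 from 28 needs output_padding = 1 because the forward division discarded a remainder.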
-void raw_miopen_depthwise_convolution_backward_input_out( - const at::Tensor& grad_input, - const at::Tensor& grad_output, - const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - - auto dataType = getMiopenDataType(grad_output); - miopenConvolutionMode_t c_mode = miopenDepthwise; +void raw_miopen_convolution_backward_weight_out_32bit( + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + auto dataType = getMiopenDataType(input); + miopenConvolutionMode_t c_mode = depthwise ? miopenDepthwise : miopenConvolution; - ConvolutionArgs args{ grad_input, grad_output, weight }; + ConvolutionArgs args{input, grad_output, grad_weight}; args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, grad_input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(grad_input); - args.wdesc.set(weight, grad_output.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); + at::MemoryFormat memory_format = + miopen_conv_suggest_memory_format(input, grad_weight); + setConvolutionParams( + &args.params, + args.handle, + input, + grad_weight, + padding, + stride, + dilation, + groups, + deterministic, + memory_format); + args.idesc.set(input, memory_format); + args.wdesc.set(grad_weight, memory_format, 0); + args.odesc.set(grad_output, memory_format); + args.cdesc.set( + dataType, + c_mode, + input.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + benchmark, + deterministic); if (at::globalContext().immediateMiopen()) { uint64_t solution_id; - Workspace workspace = chooseSolution(args, &solution_id); + Workspace workspace = chooseSolution(args, &solution_id); - MIOPEN_CHECK(miopenConvolutionBackwardDataImmediate( + MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate( args.handle, args.odesc.desc(), grad_output.const_data_ptr(), - args.wdesc.desc(), weight.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), args.cdesc.desc(), - args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size, solution_id)); + args.wdesc.desc(), grad_weight.data_ptr(), + workspace.data, + workspace.size, + solution_id)); + } + else { + miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionBackwardWeights( + args.handle, + &one, + args.odesc.desc(), grad_output.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), + args.cdesc.desc(), + bwdFilterAlg, + &zero, + args.wdesc.desc(), grad_weight.data_ptr(), + workspace.data, + workspace.size)); + } +} + +void raw_miopen_convolution_backward_weight_out( + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + constexpr int64_t int_max = std::numeric_limits::max(); + const int64_t ni = input.numel(); + const int64_t no = grad_output.numel(); + // Assume the shape of the tensor 
is (N, C, D1, D2, ...) + // if N * C * D1 * D2 * ... <= int_max, then no need to split at all + if (ni <= int_max && no <= int_max) { + raw_miopen_convolution_backward_weight_out_32bit( + grad_weight, + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); + return; } - else { - miopenConvBwdDataAlgorithm_t bwdDataAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &bwdDataAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardData( - args.handle, - &one, args.odesc.desc(), grad_output.const_data_ptr(), - args.wdesc.desc(), weight.const_data_ptr(), - args.cdesc.desc(), bwdDataAlg, &zero, - args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size)); + // else, if C * D1 * D2 * ... <= int_max, then we just need to split across + // the N dimension + // + // Here we use a simple heuristics to determine the size of each split + // We don't max out the 2^31 address space because this number is super + // large and very likely to get an OOM. + int64_t n = grad_output.size(0); + int64_t max_inner_size = std::max(ni, no) / n; + int64_t split_size = + std::max(1024 * 1024 * 512 / max_inner_size, 1L); + int64_t num_splits = (n + split_size - 1) / split_size; + if (split_size * max_inner_size < int_max) { + const auto kAccType = (grad_weight.scalar_type() == kHalf || + grad_weight.scalar_type() == kBFloat16) + ? kFloat + : grad_weight.scalar_type(); + Tensor grad_weight_accumulator = + at::zeros(grad_weight.sizes(), grad_weight.options().dtype(kAccType)); + for (const auto i : c10::irange(num_splits)) { + int64_t start = split_size * i; + int64_t split_size_ = std::min(split_size, n - start); + Tensor input_ = input.narrow(0, start, split_size_); + Tensor grad_output_ = grad_output.narrow(0, start, split_size_); + Tensor grad_weight_ = at::empty_like(grad_weight); + raw_miopen_convolution_backward_weight_out_32bit( + grad_weight_, + grad_output_, + input_, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); + grad_weight_accumulator.add_(grad_weight_); + } + grad_weight.copy_(grad_weight_accumulator); + return; } + // If control flow reaches here, this means even splitting N is not enough, + // then things starts to become complicated: For example, for conv2d, there + // following questions needs to be considered. + // - Is the memory layout NCHW or NHWC ? + // - If the conv is NCHW -> NC'H'W', then should we + // - split only NC? + // - split only N'C'? + // - split both? + // - If the conv is NHWC, then we need to split across H, we need to be very + // careful about the boundary condition + // to make sure that the boundary is handled correctly. + // - If we decide to make these splits, is the memory contiguous? Do we need + // to copy the memory? 
Considering the complexity of this issue, it is better + // not to use cuDNN for this case + TORCH_INTERNAL_ASSERT(false, "This case should not be dispatched to cuDNN."); } -Tensor miopen_depthwise_convolution_backward_input( +Tensor miopen_convolution_backward_weight( CheckedFrom c, - IntArrayRef input_size, const TensorArg& grad_output, const TensorArg& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - checkAllSameType(c, {grad_output, weight}); - checkAllSameGPU(c, {grad_output, weight}); + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + auto memory_format = miopen_conv_suggest_memory_format(input_t, grad_output_t); - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*grad_output, *weight)) { - memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } + Tensor grad_output_contig_t = grad_output_t.contiguous(memory_format); + TensorArg grad_output_contig{grad_output_contig_t, "grad_output", 1}; - Tensor grad_input_t = at::detail::empty_cuda( - input_size, grad_output->options().memory_format(memory_format)); + Tensor input_contig_t = input_t.contiguous(memory_format); + TensorArg input{input_contig_t, "input", 2}; - TensorArg grad_input{ grad_input_t, "result", 0 }; - convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); + checkAllSameType(c, {grad_output_contig, input}); + checkAllSameGPU(c, {grad_output_contig, input}); - // See #4500 - Tensor weight_contig = weight->contiguous(memory_format); - // Make sure that NC11 strides follow formula - weight_contig.resize_(weight_contig.sizes(), memory_format); + auto grad_weight_t = + at::empty(weight_size, grad_output_contig->options(), memory_format); - Tensor grad_output_contig = grad_output->contiguous(memory_format); - grad_output_contig.resize_(grad_output_contig.sizes(), memory_format); + // For uniformity with everything else, although it seems grad_weight + // would be unambiguous too. 
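Unlike the data path, the weight gradient from each batch chunk must be summed, so the splitting loop above accumulates per-chunk results in fp32 whenever grad_weight is half or bfloat16 and only casts back at the end. A standalone sketch of that accumulation pattern (helper name and signature are illustrative):

#include <ATen/ATen.h>
#include <vector>

// Standalone sketch of the split-and-accumulate pattern used above: one weight
// gradient per batch chunk, summed in fp32 when the weight dtype is
// half/bfloat16 to limit rounding error, then copied back into grad_weight.
void accumulate_chunk_grads(at::Tensor& grad_weight,
                            const std::vector<at::Tensor>& chunk_grads) {
  const auto acc_dtype =
      (grad_weight.scalar_type() == at::kHalf ||
       grad_weight.scalar_type() == at::kBFloat16)
          ? at::kFloat
          : grad_weight.scalar_type();
  at::Tensor acc =
      at::zeros(grad_weight.sizes(), grad_weight.options().dtype(acc_dtype));
  for (const auto& g : chunk_grads) {
    acc.add_(g);  // low-precision chunks are promoted into the fp32 accumulator
  }
  grad_weight.copy_(acc);  // copy_ casts back to the original dtype
}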
+ TensorArg grad_weight{grad_weight_t, "result", 0}; + convolution_shape_check( + c, + input, + grad_weight, + grad_output_contig, + padding, + stride, + dilation, + groups); - raw_miopen_depthwise_convolution_backward_input_out( - *grad_input, grad_output_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic); + raw_miopen_convolution_backward_weight_out( + *grad_weight, + *grad_output_contig, + *input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); - return *grad_input; + return grad_weight_t; } -Tensor miopen_depthwise_convolution_backward_input( - IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - weight{ weight_t, "weight", 2 }; - return miopen_depthwise_convolution_backward_input( - "miopen_depthwise_convolution_backward_input", - input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic); +// overload +Tensor miopen_convolution_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + return miopen_convolution_backward_weight( + "miopen_convolution_backward_weight", + weight_size, + grad_output_t, + input_t, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); } -std::tuple miopen_convolution_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, std::array output_mask) { - - Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); +std::tuple miopen_convolution_backward( + const at::Tensor& input, + const at::Tensor& grad_output_t, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + std::array output_mask) { + Tensor grad_output = grad_output_t.to(input.suggest_memory_format()); Tensor grad_input, grad_weight, grad_bias; - if (output_mask[0]) { - grad_input = miopen_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[1]) { - grad_weight = miopen_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[2]) { - grad_bias = miopen_convolution_backward_bias(grad_output); + if (input.numel() == 0) { + if (output_mask[0]) { + grad_input = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + if (output_mask[1]) { + grad_weight = at::zeros_like(weight, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + if (output_mask[2]) { + grad_bias = at::zeros_like(grad_output_t, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + } else { + if (output_mask[0]) { + grad_input = miopen_convolution_backward_input( + input.sizes(), + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); + } + if (output_mask[1]) { + grad_weight = miopen_convolution_backward_weight( + weight.sizes(), + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + 
deterministic); + } + if (output_mask[2]) { + grad_bias = miopen_convolution_backward_bias(grad_output); + } } - return std::tuple{grad_input, grad_weight, grad_bias}; + return std::tuple{grad_input, grad_weight, grad_bias}; } -std::tuple miopen_depthwise_convolution_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, std::array output_mask) { +Tensor miopen_convolution_transpose_forward( + CheckedFrom c, + const TensorArg& grad_output, + const TensorArg& weight, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { + auto input_size = conv_input_size( + grad_output->sizes(), + weight->sizes(), + padding, + output_padding, + stride, + dilation, + groups); + return miopen_convolution_backward_input( + c, + input_size, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); +} - Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); +Tensor miopen_convolution_transpose_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { + return miopen_convolution_backward_weight( + "miopen_convolution_backward_weight", + weight_size, + input_t, + grad_output_t, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); +} - Tensor grad_input, grad_weight, grad_bias; - if (output_mask[0]) { - grad_input = miopen_depthwise_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[1]) { - grad_weight = miopen_depthwise_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[2]) { - grad_bias = miopen_convolution_backward_bias(grad_output); - } +Tensor miopen_convolution_transpose( + const Tensor& input_t, + const Tensor& weight_t, + const std::optional& bias_t_opt, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); + const Tensor& bias_t = *bias_t_maybe_owned; - return std::tuple{grad_input, grad_weight, grad_bias}; + TensorArg input{input_t, "input", 1}, weight{weight_t, "weight", 2}, bias{bias_t, "bias", 3}; + CheckedFrom c = "miopen_convolution_transpose"; + auto output_t = miopen_convolution_transpose_forward( + c, + input, + weight, + padding, + output_padding, + stride, + dilation, + groups, + benchmark, + deterministic); + if (bias->defined()) { + miopen_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + } + return output_t; } -Tensor miopen_convolution_transpose( - const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) +// --------------------------------------------------------------------- +// +// Convolution depthwise +// +// 
--------------------------------------------------------------------- + +Tensor miopen_depthwise_convolution( + const Tensor& input_t, + const Tensor& weight_t, + const std::optional& bias_t_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); @@ -1474,16 +1690,86 @@ Tensor miopen_convolution_transpose( TensorArg input { input_t, "input", 1 }, weight { weight_t, "weight", 2 }, bias { bias_t, "bias", 3 }; - CheckedFrom c = "miopen_convolution_transpose"; - auto output_t = miopen_convolution_transpose_forward( - c, input, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic); + CheckedFrom c = "miopen_depthwise_convolution"; + auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input_t.options().memory_format(memory_format)); + if (output_t.numel() == 0) { + return output_t; + } + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, + c, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + true); if (bias->defined()) { - miopen_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + miopen_convolution_add_bias_(c, output, bias); } - return output_t; + return *output; } -// MIOpen fused convolution bias activation forward +std::tuple miopen_depthwise_convolution_backward( + const at::Tensor& input, + const at::Tensor& grad_output_t, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + std::array output_mask) { + Tensor grad_output = grad_output_t.to(input.suggest_memory_format()); + + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = miopen_convolution_backward_input( + input.sizes(), + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + true); + } + if (output_mask[1]) { + grad_weight = miopen_convolution_backward_weight( + weight.sizes(), + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + true); + } + if (output_mask[2]) { + grad_bias = miopen_convolution_backward_bias(grad_output); + } + + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +// --------------------------------------------------------------------- +// fusions +// --------------------------------------------------------------------- + void raw_miopen_convolution_relu_out( const Tensor& output, const Tensor& input, @@ -1495,17 +1781,35 @@ void raw_miopen_convolution_relu_out( int64_t groups, bool benchmark, bool deterministic) { - auto dataType = getMiopenDataType(input); miopenConvolutionMode_t c_mode = miopenConvolution; - ConvolutionArgs args{ input, output, weight }; args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(weight, input.suggest_memory_format(), 0); - args.odesc.set(output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, 
args.params.groups, benchmark, deterministic); + at::MemoryFormat memory_format = miopen_conv_suggest_memory_format(input, weight); + setConvolutionParams( + &args.params, + args.handle, + input, + weight, + padding, + stride, + dilation, + groups, + deterministic, + memory_format); + args.idesc.set(input, memory_format); + args.wdesc.set(weight, memory_format, 0); + args.odesc.set(output, memory_format); + args.cdesc.set( + dataType, + c_mode, + input.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + benchmark, + deterministic); TensorDescriptor bdesc; bdesc.set(bias.expand({1, bias.size(0)}), output.dim()); @@ -1549,8 +1853,8 @@ static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat m } Tensor miopen_convolution_add_relu( - const Tensor& input, - const Tensor& weight, + const Tensor& input_t, + const Tensor& weight_t, const Tensor& z, const std::optional& alpha, const std::optional& bias, @@ -1562,17 +1866,28 @@ Tensor miopen_convolution_add_relu( // MIOpen does not support fusion of add, the alpha2 * z step of the below cuDNN function: // y = act ( alpha1 * conv(x) + alpha2 * z + bias ) - auto memory_format = input.suggest_memory_format(); + auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); auto& ctx = at::globalContext(); bool benchmark = ctx.benchmarkCuDNN(); - TensorArg input_arg { input, "input", 1 }, - weight_arg { weight, "weight", 2 }; - auto output = miopen_convolution_forward( + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }; + + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input_t.options().memory_format(memory_format)); + if (output_t.numel() == 0){ + return output_t; + } + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, "miopen_convolution_add_relu", - input_arg, - weight_arg, + input, + weight, padding, stride, dilation, @@ -1581,53 +1896,51 @@ Tensor miopen_convolution_add_relu( false // deterministic ); - auto contig_output = self_or_new_memory_format(output, memory_format); + auto contig_output_t = self_or_new_memory_format(output_t, memory_format); - if (!output.is_same(contig_output)) { - contig_output.copy_(output); + if (!output_t.is_same(contig_output_t)) { + contig_output_t.copy_(output_t); } auto _alpha = alpha.has_value() ? alpha.value().to() : 1.0; auto _bias = bias.has_value() ? 
bias.value() : at::zeros( - {contig_output.size(1)}, - optTypeMetaToScalarType(contig_output.options().dtype_opt()), - contig_output.options().layout_opt(), - contig_output.options().device_opt(), - contig_output.options().pinned_memory_opt()); + {contig_output_t.size(1)}, + optTypeMetaToScalarType(contig_output_t.options().dtype_opt()), + contig_output_t.options().layout_opt(), + contig_output_t.options().device_opt(), + contig_output_t.options().pinned_memory_opt()); - at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input.dim(), _bias).add(z, _alpha); - contig_output.add_(alpha_mul_z_add_bias); - contig_output.relu_(); + at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input_t.dim(), _bias).add(z, _alpha); + contig_output_t.add_(alpha_mul_z_add_bias); + contig_output_t.relu_(); - return contig_output; + return contig_output_t; } Tensor miopen_convolution_relu( - const Tensor& input, - const Tensor& weight, + const Tensor& input_t, + const Tensor& weight_t, const std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, int64_t groups) { - auto memory_format = input.suggest_memory_format(); - auto& ctx = at::globalContext(); bool benchmark = ctx.benchmarkCuDNN(); // MIOpen currently only supports MemoryFormat::Contiguous and fp32 and 2d - if (input.suggest_memory_format() == at::MemoryFormat::Contiguous - && input.scalar_type() == at::kFloat - && input.ndimension() == 4) { + if (input_t.suggest_memory_format() == at::MemoryFormat::Contiguous + && input_t.scalar_type() == at::kFloat + && input_t.ndimension() == 4) { // FuseFrozenConvAddRelu performs some tensor shape checking Tensor output_t = at::detail::empty_cuda( conv_output_size( - input.sizes(), weight.sizes(), padding, stride, dilation), - input.options().memory_format(input.suggest_memory_format())); + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input_t.options().memory_format(input_t.suggest_memory_format())); if (output_t.numel() == 0) { return output_t; } @@ -1643,8 +1956,8 @@ Tensor miopen_convolution_relu( raw_miopen_convolution_relu_out( output_t, - input, - weight, + input_t, + weight_t, _bias, stride, padding, @@ -1659,12 +1972,25 @@ Tensor miopen_convolution_relu( else { // fallback - TensorArg input_arg { input, "input", 1 }, - weight_arg { weight, "weight", 2 }; - auto output = miopen_convolution_forward( + auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); + + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }; + + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input->options().memory_format(memory_format)); + if (output_t.numel() == 0){ + return output_t; + } + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, "miopen_convolution_relu", - input_arg, - weight_arg, + input, + weight, padding, stride, dilation, @@ -1673,26 +1999,26 @@ Tensor miopen_convolution_relu( false // deterministic ); - auto contig_output = self_or_new_memory_format(output, memory_format); + auto contig_output_t = self_or_new_memory_format(output_t, memory_format); - if (!output.is_same(contig_output)) { - contig_output.copy_(output); + if (!output_t.is_same(contig_output_t)) { + contig_output_t.copy_(output_t); } auto _bias = bias.has_value() ? 
bias.value() : at::zeros( - {contig_output.size(1)}, - optTypeMetaToScalarType(contig_output.options().dtype_opt()), - contig_output.options().layout_opt(), - contig_output.options().device_opt(), - contig_output.options().pinned_memory_opt()); + {contig_output_t.size(1)}, + optTypeMetaToScalarType(contig_output_t.options().dtype_opt()), + contig_output_t.options().layout_opt(), + contig_output_t.options().device_opt(), + contig_output_t.options().pinned_memory_opt()); - at::Tensor reshaped_bias = at::native::reshape_bias(input.dim(), _bias); - contig_output.add_(reshaped_bias); - contig_output.relu_(); + at::Tensor reshaped_bias = at::native::reshape_bias(input_t.dim(), _bias); + contig_output_t.add_(reshaped_bias); + contig_output_t.relu_(); - return contig_output; + return contig_output_t; } } diff --git a/aten/src/ATen/native/mkldnn/xpu/Attention.cpp b/aten/src/ATen/native/mkldnn/xpu/Attention.cpp index 813db7a97ef9f..873005b3dd2bc 100644 --- a/aten/src/ATen/native/mkldnn/xpu/Attention.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/Attention.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -49,7 +50,7 @@ bool check_no_grad(sdp::sdp_params const& params, bool debug) { return !any_inputs_require_grad || !gradmode_enabled; } -bool use_overrideable_xpu(sdp::sdp_params const& params, bool debug) { +bool can_use_overrideable_attention(sdp::sdp_params const& params, bool debug) { constexpr auto supported_dtypes = c10::array_of( at::kFloat, at::kBFloat16, at::kHalf); // double is not supported @@ -73,6 +74,42 @@ bool use_overrideable_xpu(sdp::sdp_params const& params, bool debug) { return sdp::check_tensor_dtype(params, supported_dtypes, debug); } +bool can_use_flash_attention(sdp::sdp_params const& params, bool debug) { + // Currently, XPU falls back flash attention to overrideable + return can_use_overrideable_attention(params, debug); +} + +bool can_use_cudnn_attention(sdp::sdp_params const& params, bool debug) { + if (debug) { + TORCH_WARN("XPU doesn't support the SDPA cuDNN attention backend."); + } + return false; +} + +bool can_use_mem_efficient_attention(sdp::sdp_params const& params, bool debug) { + if (debug) { + TORCH_WARN("XPU doesn't support the SDPA memory-efficient attention backend."); + } + return false; +} + +bool priority_order_init = false; + +std::array priority_order( + sdp::sdp_params const& params) { + if (!priority_order_init) { + priority_order_init = true; + const std::vector priority_order = { + static_cast(at::SDPBackend::overrideable), + static_cast(at::SDPBackend::math), + static_cast(at::SDPBackend::flash_attention), + static_cast(at::SDPBackend::efficient_attention), + static_cast(at::SDPBackend::cudnn_attention)}; + at::globalContext().setSDPPriorityOrder(priority_order); + } + return at::globalContext().sDPPriorityOrder(); +} + sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) { // This function defines the priority order of the different sdp backends // 1.
Flash Attention @@ -85,20 +122,16 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) { } // Get ideal kernel ordering - const std::array priority_order{ - sdp::SDPBackend::overrideable, - sdp::SDPBackend::math, - sdp::SDPBackend::flash_attention, - }; + const auto ordering = priority_order(kernel_params); // Because TORCHCHECK checks if condition is true we negate debug so that // The statements will be printed when debug is true bool print_debug = false; - for (auto& backend : priority_order) { + for (auto& backend : ordering) { switch (backend) { case sdp::SDPBackend::overrideable: if (ctx.userEnabledOverrideableSDP() && - use_overrideable_xpu(kernel_params, print_debug)) { + can_use_overrideable_attention(kernel_params, print_debug)) { return sdp::SDPBackend::overrideable; } break; @@ -109,25 +142,43 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) { break; case sdp::SDPBackend::flash_attention: if (ctx.userEnabledFlashSDP() && - use_overrideable_xpu(kernel_params, print_debug)) { - TORCH_WARN( - "Flash Attention is not supported on XPU, falling back to overrideable kernel."); + can_use_flash_attention(kernel_params, print_debug)) { + TORCH_WARN_ONCE( + "SDPA Flash Attention backend is not supported on XPU, falling back to OVERRIDEABLE backend."); return sdp::SDPBackend::overrideable; } break; + case sdp::SDPBackend::cudnn_attention: + if (ctx.userEnabledCuDNNSDP() && + can_use_cudnn_attention(kernel_params, print_debug)) { + TORCH_CHECK(false, "Invalid backend"); + } + break; + case sdp::SDPBackend::efficient_attention: + if (ctx.userEnabledMemEfficientSDP() && + can_use_mem_efficient_attention(kernel_params, print_debug)) { + TORCH_CHECK(false, "Invalid backend"); + } + break; default: TORCH_CHECK(false, "Invalid backend"); } } // If we have gotten to this point then two things have happened: - // 1. use_overrideable_xpu did not satisfy the constraints to be ran + // 1. can_use_overrideable_attention did not satisfy the constraints to be run // 2. The user has explicitly disabled the math kernel // We then re-run the kernel checks with debug enabled to print out the // reason why the kernel was not selected print_debug = true; - TORCH_WARN("OneDNN kernel not used because:"); - use_overrideable_xpu(kernel_params, print_debug); + TORCH_WARN("Flash attention kernel not used because:"); + can_use_flash_attention(kernel_params, print_debug); + TORCH_WARN("Overrideable attention kernel not used because:"); + can_use_overrideable_attention(kernel_params, print_debug); + TORCH_WARN("CuDNN attention kernel not used because:"); + can_use_cudnn_attention(kernel_params, print_debug); + TORCH_WARN("Memory Efficient attention kernel not used because:"); + can_use_mem_efficient_attention(kernel_params, print_debug); TORCH_CHECK(!print_debug, "No available kernel. Aborting execution.") return sdp::SDPBackend::error; } @@ -209,7 +260,7 @@ _scaled_dot_product_fused_attention_overrideable_xpu( alloc_with_matching_layout(query, output, output_shape); at::Tensor logsumexp, debug_attn_mask; // not supported - at::native::onednn::gpu_float_sdpa( + at::native::onednn::sdpa( batch_size, seq_len_q, seq_len_kv, @@ -223,7 +274,9 @@ _scaled_dot_product_fused_attention_overrideable_xpu( attn_bias, is_causal, scale.has_value() ?
scale.value() : (1.0 / std::sqrt(head_dim_qk)), - output); + output, + false, + logsumexp); // rng not used auto philox_seed = at::empty({}, at::dtype(at::kLong)); diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp index 1d90711f6e382..e840e21f4f7a1 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp @@ -13,6 +13,9 @@ using dims = logical_tensor::dims; using op = dnnl::graph::op; using partition = dnnl::graph::partition; +constexpr logical_tensor::data_type sdpa_intermediate_dtype = + logical_tensor::data_type::f32; + inline data_type to_logical_tensor_data_type(c10::ScalarType scalar_type) { return scalar_type == c10::ScalarType::Float ? data_type::f32 : scalar_type == c10::ScalarType::Half ? data_type::f16 @@ -20,6 +23,8 @@ inline data_type to_logical_tensor_data_type(c10::ScalarType scalar_type) { : data_type::undef; } +namespace sdpa_forward { + struct SDPALogicalParams { enum class TensorID { query, @@ -28,7 +33,8 @@ struct SDPALogicalParams { neg_inf, attn_mask, value, - output, + attention, + logsumexp, end, }; @@ -38,14 +44,16 @@ struct SDPALogicalParams { std::optional neg_inf; std::optional attn_mask; logical_tensor value{}; - logical_tensor output{}; + logical_tensor attention{}; + std::optional logsumexp; SDPALogicalParams( const at::Tensor& query_, const at::Tensor& key_, const at::Tensor& value_, const std::optional& attn_mask_, - const at::Tensor& output_, + const at::Tensor& attention_, + const at::Tensor& logsumexp_, int batch_size, int seq_len_q, int seq_len_kv, @@ -53,19 +61,26 @@ struct SDPALogicalParams { int num_head_kv, int head_dim_qk, int head_dim_v, - bool is_causal) { + bool is_causal, + bool compute_logsumexp) { const data_type dtype = to_logical_tensor_data_type(query_.scalar_type()); TORCH_INTERNAL_ASSERT( (dtype != data_type::undef), "Only FP16/BF16/FP32 datatypes are currently supported"); + TORCH_INTERNAL_ASSERT( + query_.scalar_type() == attention_.scalar_type(), + "scaled_dot_product_attention_xpu: query and attention tensors should have the same data type."); const dims scalar_shape = {1}; - std::vector inputLogicalTensors; at::Tensor reshaped_query = query_; at::Tensor reshaped_key = key_; at::Tensor reshaped_value = value_; - at::Tensor reshaped_output = output_; + at::Tensor reshaped_attention = attention_; + at::Tensor reshaped_logsumexp = + compute_logsumexp ? 
logsumexp_.unsqueeze(-1) : logsumexp_; at::Tensor reshaped_attn_mask = attn_mask_.value_or(at::Tensor()); + + // handle broadcasted input tensors for OneDNN if (at::native::onednn::is_broadcast(reshaped_query)) { at::native::onednn::undo_broadcast(reshaped_query); } @@ -75,9 +90,6 @@ struct SDPALogicalParams { if (at::native::onednn::is_broadcast(reshaped_value)) { at::native::onednn::undo_broadcast(reshaped_value); } - if (at::native::onednn::is_broadcast(reshaped_output)) { - at::native::onednn::undo_broadcast(reshaped_output); - } if (attn_mask_.has_value() && at::native::onednn::is_broadcast(reshaped_attn_mask)) { at::native::onednn::undo_broadcast(reshaped_attn_mask); @@ -95,23 +107,22 @@ struct SDPALogicalParams { {batch_size, group_num, group_size, seq_len_q, head_dim_qk}); reshaped_key = key_.unsqueeze(2); reshaped_value = value_.unsqueeze(2); - reshaped_output = output_.view( + reshaped_attention = attention_.view( {batch_size, group_num, group_size, seq_len_q, head_dim_v}); if (attn_mask_.has_value() && attn_mask_.value().dim() == 4) { reshaped_attn_mask = attn_mask_.value().unsqueeze(2); } } - query = { - static_cast(TensorID::query), - dtype, - reshaped_query.sizes().vec(), - reshaped_query.strides().vec()}; - key = { - static_cast(TensorID::key), - dtype, - reshaped_key.sizes().vec(), - reshaped_key.strides().vec()}; +#define LOGIC_TENSOR_DESC(name, dtype) \ + name = { \ + static_cast(TensorID::name), \ + dtype, \ + reshaped_##name.sizes().vec(), \ + reshaped_##name.strides().vec()} + + LOGIC_TENSOR_DESC(query, dtype); + LOGIC_TENSOR_DESC(key, dtype); scale = { static_cast(TensorID::scale), to_logical_tensor_data_type(at::toOpMathType(query_.scalar_type())), @@ -132,22 +143,19 @@ struct SDPALogicalParams { TORCH_INTERNAL_ASSERT( (mask_dtype != data_type::undef), "Only FP16/BF16/FP32 datatypes are currently supported for attn_mask"); - attn_mask = { - static_cast(TensorID::attn_mask), - mask_dtype, - reshaped_attn_mask.sizes().vec(), - reshaped_attn_mask.strides().vec()}; + LOGIC_TENSOR_DESC(attn_mask, mask_dtype); } - value = { - static_cast(TensorID::value), - dtype, - reshaped_value.sizes().vec(), - reshaped_value.strides().vec()}; - output = { - static_cast(TensorID::output), - dtype, - reshaped_output.sizes().vec(), - reshaped_output.strides().vec()}; + LOGIC_TENSOR_DESC(value, dtype); + LOGIC_TENSOR_DESC(attention, dtype); + if (compute_logsumexp) { + TORCH_INTERNAL_ASSERT( + logsumexp_.scalar_type() == at::kFloat, + "scaled_dot_product_attention: Expected logsumexp data type in FP32, but got ", + logsumexp_.scalar_type(), + " instead."); + LOGIC_TENSOR_DESC(logsumexp, sdpa_intermediate_dtype); + } +#undef LOGIC_TENSOR_DESC } std::vector get_input() const { std::vector input = {query, key, scale}; @@ -161,16 +169,21 @@ struct SDPALogicalParams { return input; } std::vector get_output() const { - return {output}; + std::vector output; + output.push_back(attention); + if (logsumexp.has_value()) { + output.push_back(logsumexp.value()); + } + return output; } }; partition create_sdpa_graph_partition( bool is_causal, + bool compute_logsumexp, data_type dtype, const SDPALogicalParams& params) { // graph building and partitioning - // currently, we assume that Q and K have same sequence length size_t lt_id = static_cast(SDPALogicalParams::TensorID::end); size_t op_id = 0; @@ -180,7 +193,7 @@ partition create_sdpa_graph_partition( // Matrix Extensions (Intel(R) XMX) support, which means the // Q/K/V tensors have bf16 or f16 data type while the output of the first // MatMul, 
Scale, Mask, and the input of SoftMax are in f32 data type. - logical_tensor matmul_qk_out{lt_id++, data_type::f32}; + logical_tensor matmul_qk_out{lt_id++, sdpa_intermediate_dtype}; op matmul_qk{ op_id++, op::kind::MatMul, @@ -189,7 +202,7 @@ partition create_sdpa_graph_partition( "matmul_qk"}; matmul_qk.set_attr(op::attr::transpose_b, true); - logical_tensor scaled_qk_out{lt_id++, data_type::f32}; + logical_tensor scaled_qk_out{lt_id++, sdpa_intermediate_dtype}; op scale_mul{ op_id++, op::kind::Multiply, @@ -214,7 +227,7 @@ partition create_sdpa_graph_partition( if (params.attn_mask.has_value()) { TORCH_INTERNAL_ASSERT( !is_causal, "Additive mask cannot use with is_causal."); - masked_qk_out = {lt_id++, data_type::f32}; + masked_qk_out = {lt_id++, sdpa_intermediate_dtype}; mask_add = { op_id++, op::kind::Add, @@ -249,7 +262,7 @@ partition create_sdpa_graph_partition( {mask_gt_out.value()}, "mask_gt"}; - masked_qk_out = {lt_id++, data_type::f32}; + masked_qk_out = {lt_id++, sdpa_intermediate_dtype}; mask_select = { op_id++, op::kind::Select, @@ -270,12 +283,15 @@ partition create_sdpa_graph_partition( logical_tensor softmax_out{lt_id++, dtype}; softmax.add_input(masked_qk_out.value_or(scaled_qk_out)); softmax.add_output(softmax_out); + if (compute_logsumexp) { + softmax.add_output(params.logsumexp.value()); + } op matmul_v{ op_id++, op::kind::MatMul, {softmax_out, params.value}, - {params.output}, + {params.attention}, "matmul_v"}; constexpr auto ekind = dnnl::engine::kind::gpu; @@ -304,44 +320,469 @@ partition create_sdpa_graph_partition( partition& find_or_create_graph_partition( bool is_causal, + bool compute_logsumexp, const SDPALogicalParams& params) { - thread_local static PartitionCache cache; + thread_local PartitionCache cache; const data_type dtype = params.query.get_data_type(); // cache key creation // patternID is determined on the basis of the arguments provided std::bitset<32> patternID; if (dtype == data_type::f32) { - // bit 3 corresponds to float32 dtype - patternID.set(3, 1); + patternID.set(static_cast(PartitionCache::BitType::Float32), 1); } if (dtype == data_type::bf16) { - // bit 2 corresponds to fp16/bf16 dtype - patternID.set(2, 1); + patternID.set(static_cast(PartitionCache::BitType::Bfloat16), 1); } // sdp pattern - patternID.set(4, 1); + patternID.set(static_cast(PartitionCache::BitType::SdpaPattern), 1); // Refer to comments in Utils.h. 
The first 8 bits are reserved int pos = 8; // attn_mask patternID.set(pos++, params.attn_mask.has_value()); patternID.set(pos++, is_causal); + // compute_logsumexp + patternID.set(pos++, compute_logsumexp); auto partition_ = cache.find_partition(patternID); if (!partition_.has_value()) { // partition cache no hit // graph building and partitioning - partition sdp_partition = - create_sdpa_graph_partition(is_causal, dtype, params); + partition sdp_partition = create_sdpa_graph_partition( + is_causal, compute_logsumexp, dtype, params); partition_ = cache.insert_partition_cache(patternID, sdp_partition); } return *partition_; } +} // namespace sdpa_forward + +namespace sdpa_backward { + +struct SDPABackwardLogicalParams { + enum class TensorID { + grad_out, + query, + key, + value, + out, + logsumexp, + scale, + neg_inf, + attn_mask, + grad_query, + grad_key, + grad_value, + end, + }; + + logical_tensor grad_out{}; + logical_tensor query{}; + logical_tensor key{}; + logical_tensor value{}; + logical_tensor out{}; + logical_tensor logsumexp{}; + logical_tensor scale{}; + std::optional neg_inf; + std::optional attn_mask; + logical_tensor grad_query{}; + logical_tensor grad_key{}; + logical_tensor grad_value{}; + + SDPABackwardLogicalParams( + const at::Tensor& grad_out_, + const at::Tensor& query_, + const at::Tensor& key_, + const at::Tensor& value_, + const at::Tensor& out_, + const at::Tensor& logsumexp_, + const std::optional& attn_mask_, + const at::Tensor& grad_query_, + const at::Tensor& grad_key_, + const at::Tensor& grad_value_, + int batch_size, + int num_head_q, + int num_head_kv, + int seq_len_q, + int seq_len_kv, + int head_dim_qk, + int head_dim_v, + bool is_causal) { + const data_type dtype = to_logical_tensor_data_type(query_.scalar_type()); + TORCH_INTERNAL_ASSERT( + (dtype != data_type::undef), + "Only FP16/BF16/FP32 datatypes are currently supported"); + TORCH_INTERNAL_ASSERT( + grad_out_.scalar_type() == query_.scalar_type() && + grad_out_.scalar_type() == key_.scalar_type() && + grad_out_.scalar_type() == value_.scalar_type() && + grad_out_.scalar_type() == out_.scalar_type(), + "scaled_dot_product_attention_backward_xpu: Expected grad_out, q, k, v and out to have the same data type, but got ", + " grad_out: ", + grad_out_.scalar_type(), + ", q: ", + query_.scalar_type(), + ", k: ", + key_.scalar_type(), + ", v: ", + value_.scalar_type(), + ", out: ", + out_.scalar_type()); + TORCH_INTERNAL_ASSERT( + logsumexp_.defined() && logsumexp_.scalar_type() == at::kFloat, + "scaled_dot_product_attention_backward_xpu: Expected logsumexp to be defined and have FP32 data type"); + const dims scalar_shape = {1}; + + at::Tensor reshaped_grad_out = grad_out_; + at::Tensor reshaped_query = query_; + at::Tensor reshaped_key = key_; + at::Tensor reshaped_value = value_; + at::Tensor reshaped_out = out_; + at::Tensor reshaped_logsumexp = logsumexp_.unsqueeze(-1); + at::Tensor reshaped_attn_mask = attn_mask_.value_or(at::Tensor()); + at::Tensor reshaped_grad_query = grad_query_; + at::Tensor reshaped_grad_key = grad_key_; + at::Tensor reshaped_grad_value = grad_value_; + + // handle broadcasted input tensors for OneDNN + if (at::native::onednn::is_broadcast(reshaped_grad_out)) { + at::native::onednn::undo_broadcast(reshaped_grad_out); + } + if (at::native::onednn::is_broadcast(reshaped_query)) { + at::native::onednn::undo_broadcast(reshaped_query); + } + if (at::native::onednn::is_broadcast(reshaped_key)) { + at::native::onednn::undo_broadcast(reshaped_key); + } + if 
(at::native::onednn::is_broadcast(reshaped_value)) { + at::native::onednn::undo_broadcast(reshaped_value); + } + if (attn_mask_.has_value() && + at::native::onednn::is_broadcast(reshaped_attn_mask)) { + at::native::onednn::undo_broadcast(reshaped_attn_mask); + } + + // TODO: Support GQA in backward pass once OneDNN supports it. + +#define LOGIC_TENSOR_DESC(name, dtype) \ + name = { \ + static_cast(TensorID::name), \ + dtype, \ + reshaped_##name.sizes().vec(), \ + reshaped_##name.strides().vec()} + + LOGIC_TENSOR_DESC(grad_out, dtype); + LOGIC_TENSOR_DESC(query, dtype); + LOGIC_TENSOR_DESC(key, dtype); + LOGIC_TENSOR_DESC(value, dtype); + LOGIC_TENSOR_DESC(out, dtype); + LOGIC_TENSOR_DESC(logsumexp, sdpa_intermediate_dtype); + scale = { + static_cast(TensorID::scale), + to_logical_tensor_data_type(at::toOpMathType(query_.scalar_type())), + scalar_shape, + logical_tensor::layout_type::strided, + logical_tensor::property_type::constant}; + if (is_causal) { + neg_inf = { + static_cast(TensorID::neg_inf), + to_logical_tensor_data_type(at::toOpMathType(query_.scalar_type())), + scalar_shape, + logical_tensor::layout_type::strided, + logical_tensor::property_type::constant}; + } + if (attn_mask_.has_value()) { + const data_type mask_dtype = + to_logical_tensor_data_type(attn_mask_->scalar_type()); + TORCH_INTERNAL_ASSERT( + (mask_dtype != data_type::undef), + "Only FP16/BF16/FP32 datatypes are currently supported for attn_mask"); + LOGIC_TENSOR_DESC(attn_mask, mask_dtype); + } + LOGIC_TENSOR_DESC(grad_query, dtype); + LOGIC_TENSOR_DESC(grad_key, dtype); + LOGIC_TENSOR_DESC(grad_value, dtype); +#undef LOGIC_TENSOR_DESC + } + std::vector get_input() const { + std::vector input = { + grad_out, query, key, value, out, logsumexp, scale}; + if (neg_inf.has_value()) { + input.push_back(neg_inf.value()); + } + if (attn_mask.has_value()) { + input.push_back(attn_mask.value()); + } + return input; + } + std::vector get_output() const { + std::vector output = {grad_query, grad_key, grad_value}; + return output; + } +}; + +partition create_sdpa_backward_graph_partition( + bool is_causal, + data_type dtype, + const SDPABackwardLogicalParams& params) { + // graph building and partitioning + size_t lt_id = static_cast(SDPABackwardLogicalParams::TensorID::end); + size_t op_id = 0; + + // OneDNN graph has optimized implementation for `f16` or `bf16` SDPA with + // `f32` intermediate data type on Intel Graphics Products with Intel(R) Xe + // Matrix Extensions (Intel(R) XMX) support, which means the + // Q/K/V tensors have bf16 or f16 data type while the output of the first + // MatMul, Scale, Mask, and the input of SoftMax are in f32 data type. 
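The ops assembled below recompute the attention probabilities from the saved logsumexp and then chain the standard scaled-dot-product-attention gradients. For orientation, a minimal eager ATen sketch of the same computation, assuming plain 4-D [batch, heads, seq, head_dim] tensors, no GQA, no dropout, and any additive mask already folded into the scores; the function and parameter names are placeholders, not taken from this change:

#include <ATen/ATen.h>
#include <tuple>

// Reference gradients for SDPA, recovering softmax(S) as exp(S - logsumexp).
std::tuple<at::Tensor, at::Tensor, at::Tensor> sdpa_backward_reference(
    const at::Tensor& grad_out,   // [B, H, seq_q, head_dim_v]
    const at::Tensor& q,          // [B, H, seq_q, head_dim_qk]
    const at::Tensor& k,          // [B, H, seq_kv, head_dim_qk]
    const at::Tensor& v,          // [B, H, seq_kv, head_dim_v]
    const at::Tensor& logsumexp,  // [B, H, seq_q], fp32, saved by the forward pass
    double scale_val) {
  auto S = at::matmul(q, k.transpose(-2, -1)) * scale_val;  // add the mask here if one is used
  auto P = at::exp(S - logsumexp.unsqueeze(-1));            // attention probabilities
  auto grad_v = at::matmul(P.transpose(-2, -1), grad_out);  // dV = P^T @ dO
  auto grad_p = at::matmul(grad_out, v.transpose(-2, -1));  // dP = dO @ V^T
  // Softmax backward: dS = P * (dP - rowsum(dP * P))
  auto grad_s = P * (grad_p - (grad_p * P).sum(-1, /*keepdim=*/true));
  auto grad_s_scaled = grad_s * scale_val;                  // account for the scale multiply
  auto grad_q = at::matmul(grad_s_scaled, k);               // dQ = dS_scaled @ K
  auto grad_k = at::matmul(grad_s_scaled.transpose(-2, -1), q);  // dK = dS_scaled^T @ Q
  return {grad_q, grad_k, grad_v};
}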
+ logical_tensor matmul_qk_out{lt_id++, sdpa_intermediate_dtype}; + op matmul_qk{ + op_id++, + op::kind::MatMul, + {params.query, params.key}, + {matmul_qk_out}, + "matmul_qk"}; + matmul_qk.set_attr(op::attr::transpose_b, true); + + logical_tensor scaled_qk_out{lt_id++, sdpa_intermediate_dtype}; + op scale_mul{ + op_id++, + op::kind::Multiply, + {matmul_qk_out, params.scale}, + {scaled_qk_out}, + "scale_mul"}; + + std::optional masked_qk_out; + + // For optional additive mask + std::optional mask_add; + + // For optional implicite causal mask + std::optional mask_gen_idx_row; + std::optional mask_row_idx; + std::optional mask_gen_idx_col; + std::optional mask_col_idx; + std::optional mask_gt; + std::optional mask_gt_out; + std::optional mask_select; + + if (params.attn_mask.has_value()) { + TORCH_INTERNAL_ASSERT( + !is_causal, "Additive mask cannot use with is_causal."); + masked_qk_out = {lt_id++, sdpa_intermediate_dtype}; + mask_add = { + op_id++, + op::kind::Add, + {scaled_qk_out, params.attn_mask.value()}, + {masked_qk_out.value()}, + "mask_add"}; + } else if (is_causal) { + mask_row_idx = {lt_id++, data_type::s32}; + mask_gen_idx_row = { + op_id++, + op::kind::GenIndex, + {scaled_qk_out}, + {mask_row_idx.value()}, + "mask_gen_idx_row"}; + mask_gen_idx_row->set_attr(op::attr::axis, -2); + + mask_col_idx = {lt_id++, data_type::s32}; + mask_gen_idx_col = { + op_id++, + op::kind::GenIndex, + {scaled_qk_out}, + {mask_col_idx.value()}, + "mask_gen_idx_col"}; + mask_gen_idx_col->set_attr(op::attr::axis, -1); + + mask_gt_out = {lt_id++, data_type::boolean}; + mask_gt = { + op_id++, + op::kind::GreaterEqual, + {mask_row_idx.value(), mask_col_idx.value()}, + {mask_gt_out.value()}, + "mask_gt"}; + + masked_qk_out = {lt_id++, sdpa_intermediate_dtype}; + mask_select = { + op_id++, + op::kind::Select, + {mask_gt_out.value(), scaled_qk_out, params.neg_inf.value()}, + {masked_qk_out.value()}, + "mask_select"}; + } + + // attention_probs = softmax(masked_score) = exp(masked_score - logsumexp) + logical_tensor sub_out{lt_id++, sdpa_intermediate_dtype}; + op subtract{ + op_id++, + op::kind::Subtract, + {masked_qk_out.value_or(scaled_qk_out), params.logsumexp}, + {sub_out}, + "subtract"}; + logical_tensor prob{lt_id++, sdpa_intermediate_dtype}; + op exp{op_id++, op::kind::Exp, {sub_out}, {prob}, "exp"}; + + // The following matmul doesn't support different input dtypes, insert a + // typecast + logical_tensor prob_casted = prob; + op typecast = op(op_id++, op::kind::TypeCast, "typecast"); + if (dtype != sdpa_intermediate_dtype) { + prob_casted = logical_tensor(lt_id++, dtype); + typecast.add_inputs({prob}); + typecast.add_outputs({prob_casted}); + } + + // grad_value = prob^T * grad_out + // TODO: handle GQA headnum because (batch_size, num_head_kv, seq_len_kv, + // head_dim_v) != (batch_size, num_head_q, seqlen_kv, seq_len_q) * + // (batch_size, num_head_q, seqlen_q, head_dim_v) + op matmul_grad_value{ + op_id++, + op::kind::MatMul, + {prob_casted, params.grad_out}, + {params.grad_value}, + "matmul_grad_value"}; + matmul_grad_value.set_attr(op::attr::transpose_a, true); + + // grad_prop = grad_out * value^T + // TODO: handle GQA headnum because (batch_size, num_head_q, seq_len_q, + // seq_len_kv) != (batch_size, num_head_q, seq_len_q, head_dim_v) * + // (batch_size, num_head_kv, head_dim_v, seq_len_kv) + logical_tensor grad_prop{lt_id++, sdpa_intermediate_dtype}; + op matmul_grad_prop{ + op_id++, + op::kind::MatMul, + {params.grad_out, params.value}, + {grad_prop}, + "matmul_grad_prop"}; + 
matmul_grad_prop.set_attr(op::attr::transpose_b, true); + + // grad_masked_score = softmaxbackward(grad_prop) + logical_tensor grad_masked_score{lt_id++, sdpa_intermediate_dtype}; + op softmax_backward{ + op_id++, + op::kind::SoftMaxBackward, + {grad_prop, prob}, + {grad_masked_score}, + "softmax_backward"}; + softmax_backward.set_attr(op::attr::axis, -1); + + // TODO: add output tensor grad_attn_mask = grad_masked_score once OneDNN + // supports output grad_attn_mask. + + // grad_scaled_score = grad_masked_score * scale + logical_tensor grad_scaled_score{lt_id++, sdpa_intermediate_dtype}; + op grad_scale_mul{ + op_id++, + op::kind::Multiply, + {grad_masked_score, params.scale}, + {grad_scaled_score}, + "grad_scale_mul"}; + + // The following matmul doesn't support different input dtypes, insert a + // typecast + logical_tensor grad_scaled_score_cast = grad_scaled_score; + op typecast2 = op(op_id++, op::kind::TypeCast, "typecast2"); + if (dtype != sdpa_intermediate_dtype) { + grad_scaled_score_cast = logical_tensor(lt_id++, dtype); + typecast2.add_inputs({grad_scaled_score}); + typecast2.add_outputs({grad_scaled_score_cast}); + } + + // grad_query = grad_scaled_score_cast * key + // TODO: handle GQA headnum because (batch_size, num_head_q, seq_len_q, + // head_dim_qk) != (batch_size, num_head_q, seq_len_q, seq_len_kv) * + // (batch_size, num_head_kv, seq_len_kv, head_dim_qk) + op matmul_grad_query{ + op_id++, + op::kind::MatMul, + {grad_scaled_score_cast, params.key}, + {params.grad_query}, + "matmul_grad_query"}; + + // grad_key = grad_scaled_score_cast^T * query + op matmul_grad_key{ + op_id++, + op::kind::MatMul, + {grad_scaled_score_cast, params.query}, + {params.grad_key}, + "matmul_grad_key"}; + matmul_grad_key.set_attr(op::attr::transpose_a, true); + + constexpr auto ekind = dnnl::engine::kind::gpu; + dnnl::graph::graph g(ekind); + g.add_op(matmul_qk); + g.add_op(scale_mul); + if (mask_add.has_value()) { + g.add_op(mask_add.value()); + } + if (is_causal) { + g.add_op(mask_gen_idx_row.value()); + g.add_op(mask_gen_idx_col.value()); + g.add_op(mask_gt.value()); + g.add_op(mask_select.value()); + } + g.add_op(subtract); + g.add_op(exp); + g.add_op(matmul_grad_value); + g.add_op(matmul_grad_prop); + g.add_op(softmax_backward); + g.add_op(grad_scale_mul); + g.add_op(matmul_grad_query); + g.add_op(matmul_grad_key); + if (dtype != sdpa_intermediate_dtype) { + g.add_op(typecast); + g.add_op(typecast2); + } + g.finalize(); + auto partitions = g.get_partitions(); + TORCH_INTERNAL_ASSERT( + (partitions.size() == 1) && partitions[0].is_supported(), + "oneDNN doesn't support this fusion pattern. If you'd like its support, please submit a issue."); + return partitions[0]; +} + +partition& find_or_create_backward_graph_partition( + bool is_causal, + const SDPABackwardLogicalParams& params) { + thread_local PartitionCache cache; + const data_type dtype = params.query.get_data_type(); + + // cache key creation + // patternID is determined on the basis of the arguments provided + std::bitset<32> patternID; + if (dtype == data_type::f32) { + patternID.set(static_cast(PartitionCache::BitType::Float32), 1); + } + if (dtype == data_type::bf16) { + patternID.set(static_cast(PartitionCache::BitType::Bfloat16), 1); + } + // sdpa backward pattern + patternID.set( + static_cast(PartitionCache::BitType::SdpaBwdPattern), 1); + + // Refer to comments in Utils.h. 
The first 8 bits are reserved + int pos = 8; + // attn_mask + patternID.set(pos++, params.attn_mask.has_value()); + patternID.set(pos++, is_causal); + + auto partition_ = cache.find_partition(patternID); + if (!partition_.has_value()) { + // partition cache no hit + // graph building and partitioning + partition sdpa_backward_partition = + create_sdpa_backward_graph_partition(is_causal, dtype, params); + partition_ = + cache.insert_partition_cache(patternID, sdpa_backward_partition); + } + return *partition_; +} +} // namespace sdpa_backward } // namespace namespace at::native::onednn { -void gpu_float_sdpa( +void sdpa( int batch_size, int seq_len_q, int seq_len_kv, @@ -355,7 +796,9 @@ void gpu_float_sdpa( std::optional attn_mask, bool is_causal, float softmax_scale, - const Tensor& output) { + const Tensor& attention, + bool compute_logsumexp, + const Tensor& logsumexp) { auto& eng = GpuEngineManager::Instance().get_engine(); auto& strm = GpuStreamManager::Instance().get_stream(); @@ -370,8 +813,8 @@ void gpu_float_sdpa( }; // OneDNN doesn't support fp32 ukernel for implicit causal mask, - // and the reference implementation is worse than aten math + explict causal - // mask. Fall back to explict causal mask until OneDNN v3.9 which has fp32 + // and the reference implementation is worse than aten math + explicit causal + // mask. Fall back to explicit causal mask until OneDNN v3.9 which has fp32 // ukernel for implicit causal mask. if (is_causal && query.dtype() == at::kFloat) { attn_mask = get_tril_mask(); @@ -381,32 +824,27 @@ void gpu_float_sdpa( std::vector l_inputs, l_outputs; std::optional compiled_partition; - auto get_compiled_partition = [&]() { - const SDPALogicalParams logical_params( - query, - key, - value, - attn_mask, - output, - batch_size, - seq_len_q, - seq_len_kv, - num_head_q, - num_head_kv, - head_dim_qk, - head_dim_v, - is_causal); - auto& partition_ = - find_or_create_graph_partition(is_causal, logical_params); - auto i = logical_params.get_input(); - auto o = logical_params.get_output(); - auto compiled_partition = partition_.compile(i, o, eng); - l_inputs = std::move(i); - l_outputs = std::move(o); - return compiled_partition; - }; - - compiled_partition = get_compiled_partition(); + const sdpa_forward::SDPALogicalParams logical_params( + query, + key, + value, + attn_mask, + attention, + logsumexp, + batch_size, + seq_len_q, + seq_len_kv, + num_head_q, + num_head_kv, + head_dim_qk, + head_dim_v, + is_causal, + compute_logsumexp); + auto& partition = sdpa_forward::find_or_create_graph_partition( + is_causal, compute_logsumexp, logical_params); + l_inputs = std::move(logical_params.get_input()); + l_outputs = std::move(logical_params.get_output()); + compiled_partition = partition.compile(l_inputs, l_outputs, eng); Tensor softmax_scale1 = at::full( {}, @@ -416,26 +854,147 @@ void gpu_float_sdpa( if (is_causal) { neg_inf = at::full( {}, - -INFINITY, + -std::numeric_limits::infinity(), query.options().dtype(at::toOpMathType(query.scalar_type()))); } std::vector outputs = { - {l_outputs[0], eng, output.data_ptr()}, + {l_outputs[0], eng, attention.data_ptr()}, }; + if (compute_logsumexp) { + outputs.emplace_back(l_outputs[1], eng, logsumexp.data_ptr()); + } + size_t i = 0; std::vector inputs; inputs.reserve(l_inputs.size()); - inputs.emplace_back(l_inputs[i++], eng, query.data_ptr()); - inputs.emplace_back(l_inputs[i++], eng, key.data_ptr()); - inputs.emplace_back(l_inputs[i++], eng, softmax_scale1.data_ptr()); + +#define ADD_INPUT(variable) \ + 
inputs.emplace_back(l_inputs[i++], eng, variable.data_ptr()) + + ADD_INPUT(query); + ADD_INPUT(key); + ADD_INPUT(softmax_scale1); if (neg_inf.has_value()) { - inputs.emplace_back(l_inputs[i++], eng, neg_inf->data_ptr()); + ADD_INPUT((*neg_inf)); } if (attn_mask.has_value()) { - inputs.emplace_back(l_inputs[i++], eng, attn_mask->data_ptr()); + ADD_INPUT((*attn_mask)); } - inputs.emplace_back(l_inputs[i++], eng, value.data_ptr()); + ADD_INPUT(value); +#undef ADD_INPUT + + compiled_partition->execute(strm, inputs, outputs); +} + +void sdpa_backward( + int batch_size, + int num_head_q, + int num_head_kv, + int seq_len_q, + int seq_len_kv, + int head_dim_qk, + int head_dim_v, + const Tensor& grad_out, + const Tensor& query, + const Tensor& key, + const Tensor& value, + const Tensor& out, + const Tensor& logsumexp, + std::optional attn_mask, + bool is_causal, + double scale, + Tensor& grad_query, + Tensor& grad_key, + Tensor& grad_value) { + auto& eng = GpuEngineManager::Instance().get_engine(); + auto& strm = GpuStreamManager::Instance().get_stream(); + + const auto get_tril_mask = [&]() { + auto opts = query.options(); + auto bool_tril = + at::ones_symint({seq_len_q, seq_len_kv}, opts.dtype(at::kBool)).tril(); + return at::where( + bool_tril, + 0.f, + at::scalar_tensor(-std::numeric_limits::infinity(), opts)); + }; + + // OneDNN doesn't support fp32 ukernel for implicit causal mask, + // and the reference implementation is worse than aten math + explicit causal + // mask. Fall back to explicit causal mask until OneDNN v3.9 which has fp32 + // ukernel for implicit causal mask. + if (is_causal && query.dtype() == at::kFloat) { + attn_mask = get_tril_mask(); + is_causal = false; + } + + std::vector l_inputs, l_outputs; + std::optional compiled_partition; + + const sdpa_backward::SDPABackwardLogicalParams logical_params( + grad_out, + query, + key, + value, + out, + logsumexp, + attn_mask, + grad_query, + grad_key, + grad_value, + batch_size, + num_head_q, + num_head_kv, + seq_len_q, + seq_len_kv, + head_dim_qk, + head_dim_v, + is_causal); + auto& partition = sdpa_backward::find_or_create_backward_graph_partition( + is_causal, logical_params); + l_inputs = std::move(logical_params.get_input()); + l_outputs = std::move(logical_params.get_output()); + compiled_partition = partition.compile(l_inputs, l_outputs, eng); + + Tensor softmax_scale = at::full( + {}, scale, query.options().dtype(at::toOpMathType(query.scalar_type()))); + std::optional neg_inf; + if (is_causal) { + neg_inf = at::full( + {}, + -std::numeric_limits::infinity(), + query.options().dtype(at::toOpMathType(query.scalar_type()))); + } + + std::vector outputs = { + {l_outputs[0], eng, grad_query.data_ptr()}, + {l_outputs[1], eng, grad_key.data_ptr()}, + {l_outputs[2], eng, grad_value.data_ptr()}, + }; + + size_t i = 0; + std::vector inputs; + inputs.reserve(l_inputs.size()); + +#define ADD_INPUT(variable) \ + inputs.emplace_back(l_inputs[i++], eng, variable.data_ptr()) + + ADD_INPUT(grad_out); + ADD_INPUT(query); + ADD_INPUT(key); + ADD_INPUT(value); + ADD_INPUT(out); + ADD_INPUT(logsumexp); + ADD_INPUT(softmax_scale); + if (neg_inf.has_value()) { + ADD_INPUT((*neg_inf)); + } + if (attn_mask.has_value()) { + ADD_INPUT((*attn_mask)); + } +#undef ADD_INPUT + compiled_partition->execute(strm, inputs, outputs); } } // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h index ac8645d3e4a50..52f89bc1395d7 100644 --- 
a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h @@ -110,11 +110,21 @@ struct PartitionCache { // bit 1: is uint8 // bit 2: fp16(0) / bf16(1) // bit 3: is fp32 - // bit 4: is sdp pattern - // bit 5-7: N/A + // bit 4: is sdpa pattern + // bit 5: is sdpa backward pattern + // bit 6-7: reserved for future use // The rest of the bits depend upon the arguments provided // However, down the line, we might have different bitsets for different // patterns + enum class BitType : uint8_t { + Int8 = 0, + Uint8 = 1, + Bfloat16 = 2, + Float32 = 3, + SdpaPattern = 4, + SdpaBwdPattern = 5 + }; + dnnl::graph::partition& insert_partition_cache( std::bitset<32>& patternID, dnnl::graph::partition& p) { diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h index e73cb73e8b1e7..6b2bf01e6d73d 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h +++ b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h @@ -164,7 +164,7 @@ void quantized_matmul( std::string_view unary_post_op_algorithm, bool m2_trnas); -void gpu_float_sdpa( +void sdpa( int batch_size, int seq_len_q, int seq_len_kv, @@ -178,5 +178,28 @@ void gpu_float_sdpa( std::optional attn_mask, bool is_causal, float softmax_scale, - const Tensor& output); + const Tensor& attention, + bool compute_logsumexp, + const Tensor& logsumexp); + +void sdpa_backward( + int batch_size, + int num_head_q, + int num_head_kv, + int seq_len_q, + int seq_len_kv, + int head_dim_qk, + int head_dim_v, + const Tensor& grad_out, + const Tensor& query, + const Tensor& key, + const Tensor& value, + const Tensor& out, + const Tensor& logsumexp, + std::optional attn_mask, + bool is_causal, + double scale, + Tensor& grad_query, + Tensor& grad_key, + Tensor& grad_value); } // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/qconv.cpp b/aten/src/ATen/native/mkldnn/xpu/qconv.cpp index 1c6e2a6c89dae..c014313a5b35d 100644 --- a/aten/src/ATen/native/mkldnn/xpu/qconv.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/qconv.cpp @@ -1,5 +1,7 @@ #include #include +#include + #include #include #include @@ -7,7 +9,7 @@ using namespace at::native::onednn; namespace at::native::xpu { -static inline c10::ScalarType qconv_decide_out_dtype( +inline c10::ScalarType QConvoneDNNXPU::qconv_decide_out_dtype( const at::Tensor& act, const std::optional output_dtype) { bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat); @@ -19,7 +21,7 @@ static inline c10::ScalarType qconv_decide_out_dtype( return dst_dtype; } -static at::Tensor qconv_prepack_xpu( +at::Tensor QConvoneDNNXPU::qconv_prepack_xpu( at::Tensor weight, at::Tensor weight_scales, double input_scale, @@ -33,222 +35,265 @@ static at::Tensor qconv_prepack_xpu( return weight; } -class QConvoneDNNXPU final { - public: - static at::Tensor run_pointwise( - at::Tensor act, - double act_scale, - int64_t act_zero_point, - at::Tensor weight, - at::Tensor weight_scales, - at::Tensor weight_zero_points, - std::optional bias, - torch::List stride, - torch::List padding, - torch::List dilation, - int64_t groups, - double inv_output_scale, - int64_t output_zero_point, - std::optional output_dtype, - std::string_view attr, - torch::List> scalars, - std::optional algorithm) { - if (act.dim() == 3 || act.dim() == 5) { - TORCH_CHECK( - attr == "none", - "quantized pointwise conv", - act.dim() - 2, - "d doesn't support unary_post_op fusion. 
Got unary_post_op:", - attr, - "."); - } else { - TORCH_CHECK( - attr == "none" || attr == "relu" || attr == "hardtanh" || - attr == "hardswish" || attr == "swish", - "We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:", - attr, - "."); - } +at::Tensor QConvoneDNNXPU::run_pointwise( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double inv_output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm) { + if (act.dim() == 3 || act.dim() == 5) { + TORCH_CHECK( + attr == "none", + "quantized pointwise conv", + act.dim() - 2, + "d doesn't support unary_post_op fusion. Got unary_post_op:", + attr, + "."); + } else { + TORCH_CHECK( + attr == "none" || attr == "relu" || attr == "hardtanh" || + attr == "hardswish" || attr == "swish", + "We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:", + attr, + "."); + } - bool is_channels_last_suggested = use_channels_last_for_conv(act, weight); - auto mfmt = is_channels_last_suggested - ? get_cl_tag_by_ndim(act.ndimension()) - : at::MemoryFormat::Contiguous; - Tensor input_ = act.contiguous(mfmt); - Tensor weight_ = weight.contiguous(mfmt); + bool is_channels_last_suggested = use_channels_last_for_conv(act, weight); + auto mfmt = is_channels_last_suggested ? 
get_cl_tag_by_ndim(act.ndimension()) + : at::MemoryFormat::Contiguous; + Tensor input_ = act.contiguous(mfmt); + Tensor weight_ = weight.contiguous(mfmt); - auto dst_tz = conv_dst_size( - input_.ndimension(), - input_.sizes(), - weight_.sizes(), - padding.vec(), - padding.vec(), - stride.vec(), - dilation.vec()); + auto dst_tz = conv_dst_size( + input_.ndimension(), + input_.sizes(), + weight_.sizes(), + padding.vec(), + padding.vec(), + stride.vec(), + dilation.vec()); - auto dst_dtype = qconv_decide_out_dtype(act, output_dtype); - Tensor output = - at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt)); + auto dst_dtype = qconv_decide_out_dtype(act, output_dtype); + Tensor output = + at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt)); - return quantized_convolution( - act, - act_scale, - act_zero_point, - weight, - weight_scales, - weight_zero_points, - bias, - stride, - padding, - dilation, - /*transposed*/ false, - groups, - output, - inv_output_scale, - output_zero_point, - /*accum*/ std::nullopt, - /*accum_scale*/ 0.0, - /*accum_zero_point*/ 0, - /*output_dtype*/ output_dtype, - /*binary_attr*/ std::nullopt, - /*binary_alpha*/ std::nullopt, - /*unary_attr*/ attr, - /*unary_scalars*/ scalars, - /*unary_algorithm*/ algorithm); - } + return quantized_convolution( + act, + act_scale, + act_zero_point, + weight, + weight_scales, + weight_zero_points, + bias, + stride, + padding, + dilation, + /*transposed*/ false, + groups, + output, + inv_output_scale, + output_zero_point, + /*accum*/ std::nullopt, + /*accum_scale*/ 0.0, + /*accum_zero_point*/ 0, + /*output_dtype*/ output_dtype, + /*binary_attr*/ std::nullopt, + /*binary_alpha*/ std::nullopt, + /*unary_attr*/ attr, + /*unary_scalars*/ scalars, + /*unary_algorithm*/ algorithm); +} - static at::Tensor run_pointwise_tensor( - at::Tensor act, - at::Tensor act_scale, - at::Tensor act_zero_point, - at::Tensor weight, - at::Tensor weight_scales, - at::Tensor weight_zero_points, - std::optional bias, - torch::List stride, - torch::List padding, - torch::List dilation, - int64_t groups, - double output_scale, - int64_t output_zero_point, - std::optional output_dtype, - std::string_view attr, - torch::List> scalars, - std::optional algorithm) { - return run_pointwise( - act, - act_scale.item().toDouble(), - act_zero_point.item().toLong(), - weight, - weight_scales, - weight_zero_points, - bias, - stride, - padding, - dilation, - groups, - output_scale, - output_zero_point, - output_dtype, - /*unary_attr*/ attr, - /*unary_scalars*/ scalars, - /*unary_algorithm*/ algorithm); - } +at::Tensor QConvoneDNNXPU::run_pointwise_tensor( + at::Tensor act, + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm) { + return run_pointwise( + act, + act_scale.item().toDouble(), + act_zero_point.item().toLong(), + weight, + weight_scales, + weight_zero_points, + bias, + stride, + padding, + dilation, + groups, + output_scale, + output_zero_point, + output_dtype, + /*unary_attr*/ attr, + /*unary_scalars*/ scalars, + /*unary_algorithm*/ algorithm); +} - static at::Tensor run_pointwise_binary( - at::Tensor act, - double act_scale, - int64_t act_zero_point, - at::Tensor weight, - at::Tensor 
weight_scales, - at::Tensor weight_zero_points, - at::Tensor accum, - std::optional bias, - torch::List stride, - torch::List padding, - torch::List dilation, - int64_t groups, - double output_scale, - int64_t output_zero_point, - std::optional output_dtype, - double accum_scale, - int64_t accum_zero_point, - std::string_view binary_attr, - std::optional alpha, - std::optional unary_attr, - torch::List> unary_scalars, - std::optional unary_algorithm) { - TORCH_CHECK( - act.dim() == 4 && binary_attr == "sum" && - (!unary_attr.has_value() || - (unary_attr.has_value() && - (unary_attr.value() == "none" || unary_attr.value() == "relu"))), - "post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ", - binary_attr, - " unary_post_op: ", - unary_attr.has_value() ? unary_attr.value() : "none", - ".") +at::Tensor QConvoneDNNXPU::run_pointwise_binary( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm) { + TORCH_CHECK( + act.dim() == 4 && binary_attr == "sum" && + (!unary_attr.has_value() || + (unary_attr.has_value() && + (unary_attr.value() == "none" || unary_attr.value() == "relu"))), + "post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ", + binary_attr, + " unary_post_op: ", + unary_attr.has_value() ? unary_attr.value() : "none", + ".") - bool is_channels_last_suggested = use_channels_last_for_conv(act, weight); - auto mfmt = is_channels_last_suggested - ? get_cl_tag_by_ndim(act.ndimension()) - : at::MemoryFormat::Contiguous; - Tensor input_ = act.contiguous(mfmt); - Tensor weight_ = weight.contiguous(mfmt); + bool is_channels_last_suggested = use_channels_last_for_conv(act, weight); + auto mfmt = is_channels_last_suggested ? get_cl_tag_by_ndim(act.ndimension()) + : at::MemoryFormat::Contiguous; + Tensor input_ = act.contiguous(mfmt); + Tensor weight_ = weight.contiguous(mfmt); - auto dst_tz = conv_dst_size( - input_.ndimension(), - input_.sizes(), - weight_.sizes(), - padding.vec(), - padding.vec(), - stride.vec(), - dilation.vec()); + auto dst_tz = conv_dst_size( + input_.ndimension(), + input_.sizes(), + weight_.sizes(), + padding.vec(), + padding.vec(), + stride.vec(), + dilation.vec()); - auto dst_dtype = qconv_decide_out_dtype(act, output_dtype); - bool has_accum_postop_sum = binary_attr == "sum"; - Tensor output = has_accum_postop_sum - ? accum - : at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt)); + auto dst_dtype = qconv_decide_out_dtype(act, output_dtype); + bool has_accum_postop_sum = binary_attr == "sum"; + Tensor output = has_accum_postop_sum + ? 
accum + : at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt)); - output = quantized_convolution( - act, - act_scale, - act_zero_point, - weight, - weight_scales, - weight_zero_points, - bias, - stride, - padding, - dilation, - /*transposed*/ false, - groups, - output, - output_scale, - output_zero_point, - /*accum*/ accum, - /*accum_scale*/ accum_scale, - /*accum_zero_point*/ accum_zero_point, - /*output_dtype*/ output_dtype, - /*binary_attr*/ binary_attr, - /*binary_alpha*/ alpha, - /*unary_attr*/ unary_attr, - /*unary_scalars*/ unary_scalars, - /*unary_algorithm*/ unary_algorithm); + output = quantized_convolution( + act, + act_scale, + act_zero_point, + weight, + weight_scales, + weight_zero_points, + bias, + stride, + padding, + dilation, + /*transposed*/ false, + groups, + output, + output_scale, + output_zero_point, + /*accum*/ accum, + /*accum_scale*/ accum_scale, + /*accum_zero_point*/ accum_zero_point, + /*output_dtype*/ output_dtype, + /*binary_attr*/ binary_attr, + /*binary_alpha*/ alpha, + /*unary_attr*/ unary_attr, + /*unary_scalars*/ unary_scalars, + /*unary_algorithm*/ unary_algorithm); - if (!has_accum_postop_sum) { - return output; - } else { - return accum; - } + if (!has_accum_postop_sum) { + return output; + } else { + return accum; } -}; +} + +at::Tensor QConvoneDNNXPU::run_pointwise_binary_tensor( + at::Tensor act, // contains quantized values but not QTensor + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, // contains quantized values but not QTensor + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, // contains quantized values but not QTensor + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm) { + return run_pointwise_binary( + act, + act_scale.item().toDouble(), + act_zero_point.item().toLong(), + weight, + weight_scales, + weight_zero_points, + accum, + bias, + stride, + padding, + dilation, + groups, + output_scale, + output_zero_point, + output_dtype, + accum_scale, + accum_zero_point, + binary_attr, + alpha, + unary_attr, + unary_scalars, + unary_algorithm); +} TORCH_LIBRARY_IMPL(onednn, XPU, m) { m.impl( TORCH_SELECTIVE_NAME("onednn::qconv_prepack"), - TORCH_FN(xpu::qconv_prepack_xpu)); + TORCH_FN(QConvoneDNNXPU::qconv_prepack_xpu)); m.impl( TORCH_SELECTIVE_NAME("onednn::qconv1d_pointwise"), QConvoneDNNXPU::run_pointwise); @@ -267,6 +312,9 @@ TORCH_LIBRARY_IMPL(onednn, XPU, m) { m.impl( TORCH_SELECTIVE_NAME("onednn::qconv_pointwise.tensor"), QConvoneDNNXPU::run_pointwise_tensor); + m.impl( + TORCH_SELECTIVE_NAME("onednn::qconv2d_pointwise.binary_tensor"), + QConvoneDNNXPU::run_pointwise_binary_tensor); } } // namespace at::native::xpu diff --git a/aten/src/ATen/native/mkldnn/xpu/qconv.h b/aten/src/ATen/native/mkldnn/xpu/qconv.h new file mode 100644 index 0000000000000..e9ddd4fa29697 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/qconv.h @@ -0,0 +1,111 @@ +#pragma once + +#include +#include + +namespace at::native::xpu { +class QConvoneDNNXPU final { + public: + C10_API static at::Tensor run_pointwise( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + 
std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double inv_output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + + C10_API static at::Tensor run_pointwise_tensor( + at::Tensor act, + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + + C10_API static at::Tensor run_pointwise_binary( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + + C10_API static at::Tensor run_pointwise_binary_tensor( + at::Tensor act, + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + + static inline c10::ScalarType qconv_decide_out_dtype( + const at::Tensor& act, + const std::optional output_dtype); + + static at::Tensor qconv_prepack_xpu( + at::Tensor weight, + at::Tensor weight_scales, + double input_scale, + int64_t input_zero_point, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + std::optional> input_shape); +}; + +} // namespace at::native::xpu \ No newline at end of file diff --git a/aten/src/ATen/native/mkldnn/xpu/qlinear.cpp b/aten/src/ATen/native/mkldnn/xpu/qlinear.cpp index 7e3f2f01fa1e6..e9584e8289eb2 100644 --- a/aten/src/ATen/native/mkldnn/xpu/qlinear.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/qlinear.cpp @@ -1,13 +1,14 @@ #include #include +#include #include using namespace at::native::onednn; namespace at::native::xpu { -static inline c10::ScalarType qlinear_decide_out_dtype( +inline c10::ScalarType QLinearOnednnXPU::qlinear_decide_out_dtype( const at::Tensor& act, const std::optional output_dtype) { bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat); @@ -19,7 +20,7 @@ static inline c10::ScalarType qlinear_decide_out_dtype( return dst_dtype; } -static Tensor q_linear_pointwise( +Tensor QLinearOnednnXPU::q_linear_pointwise( Tensor act, double act_scale, int64_t act_zero_point, @@ -78,7 +79,7 @@ static Tensor q_linear_pointwise( return qout; } -static Tensor q_linear_pointwise_tensor( +Tensor QLinearOnednnXPU::q_linear_pointwise_tensor( Tensor act, Tensor act_scale, Tensor act_zero_point, @@ -137,7 +138,7 @@ static Tensor q_linear_pointwise_tensor( return 
qout; } -static Tensor q_linear_pointwise_binary( +Tensor QLinearOnednnXPU::q_linear_pointwise_binary( Tensor act, double act_scale, int64_t act_zero_point, @@ -208,7 +209,7 @@ static Tensor q_linear_pointwise_binary( return dim == 3 ? qout.reshape({act.size(0), -1, N}) : qout; } -static Tensor q_linear_pointwise_binary_tensor( +Tensor QLinearOnednnXPU::q_linear_pointwise_binary_tensor( Tensor act, Tensor act_scale, Tensor act_zero_point, @@ -248,7 +249,7 @@ static Tensor q_linear_pointwise_binary_tensor( unary_post_op_algorithm); } -static at::Tensor q_linear_prepack_onednn( +Tensor QLinearOnednnXPU::q_linear_prepack_onednn( at::Tensor weight, std::optional> input_shape) { at::Tensor weight_transposed = weight.transpose(0, 1); @@ -258,19 +259,19 @@ static at::Tensor q_linear_prepack_onednn( TORCH_LIBRARY_IMPL(onednn, XPU, m) { m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise"), - TORCH_FN(q_linear_pointwise)); + TORCH_FN(QLinearOnednnXPU::q_linear_pointwise)); m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.tensor"), - TORCH_FN(q_linear_pointwise_tensor)); + TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_tensor)); m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_prepack"), - TORCH_FN(q_linear_prepack_onednn)); + TORCH_FN(QLinearOnednnXPU::q_linear_prepack_onednn)); m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary"), - TORCH_FN(q_linear_pointwise_binary)); + TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary)); m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary_tensor"), - TORCH_FN(q_linear_pointwise_binary_tensor)); + TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary_tensor)); } } // namespace at::native::xpu diff --git a/aten/src/ATen/native/mkldnn/xpu/qlinear.h b/aten/src/ATen/native/mkldnn/xpu/qlinear.h new file mode 100644 index 0000000000000..7382276664242 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/qlinear.h @@ -0,0 +1,91 @@ +#pragma once + +#include +#include +#include + +namespace at::native::xpu { + +class QLinearOnednnXPU final { + public: + C10_API static Tensor q_linear_pointwise( + Tensor act, + double act_scale, + int64_t act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view post_op_name, + torch::List> post_op_args, + std::string_view post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_tensor( + Tensor act, + Tensor act_scale, + Tensor act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view post_op_name, + torch::List> post_op_args, + std::string_view post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_binary( + Tensor act, + double act_scale, + int64_t act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional other, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double other_scale, + int64_t other_zero_point, + std::string_view binary_post_op, + double binary_alpha, + std::string_view unary_post_op, + torch::List> unary_post_op_args, + std::string_view unary_post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_binary_tensor( + Tensor act, + Tensor act_scale, + Tensor act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional other, + 
std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double other_scale, + int64_t other_zero_point, + std::string_view binary_post_op, + double binary_alpha, + std::string_view unary_post_op, + torch::List> unary_post_op_args, + std::string_view unary_post_op_algorithm); + + C10_API static Tensor q_linear_prepack_onednn( + at::Tensor weight, + std::optional> input_shape); + + static inline c10::ScalarType qlinear_decide_out_dtype( + const at::Tensor& act, + const std::optional output_dtype); + +}; // class QLinearOnednnXPU + +} // namespace at::native::xpu diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h index e6f87f5499a47..f9cd28ca06fa8 100644 --- a/aten/src/ATen/native/mps/OperationUtils.h +++ b/aten/src/ATen/native/mps/OperationUtils.h @@ -88,14 +88,8 @@ std::string getArrayRefString(const IntArrayRef s); // use has_storage() on the returned tensor to determine if src actually is a view Tensor gatherViewTensor(const Tensor& src, Tensor& dst); Tensor& scatterViewTensor(const Tensor& src, Tensor& output); -MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, - MPSGraphTensor* inputTensor, - const TensorBase& input, - bool includesInt64 = false); -MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, - MPSGraphTensor* inputTensor, - const TensorBase& input, - bool includesInt64 = false); +MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const TensorBase& input); +MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const TensorBase& input); MPSNDArray* getStridedMPSNDArray(const TensorBase& src, MPSNDArray* srcNDArray); MPSNDArray* getMPSNDArray(const TensorBase& t, const IntArrayRef& sizes = {}, const IntArrayRef& strides = {}); @@ -435,14 +429,6 @@ inline T* LookUpOrCreateCachedGraph(const std::string& key, std::functionexecuteMPSGraph(mpsGraph, feeds, results, SyncType::COMMIT_ADAPTIVE); } -static inline void checkSupportsComplex() { - TORCH_CHECK_TYPE(supportsComplex(), "MPS complex types are only supported on MacOS 14.0 or newer."); -} - MPSDataType getMPSDataType(ScalarType scalar_type) { switch (scalar_type) { case ScalarType::Float: @@ -100,7 +96,6 @@ MPSDataType getMPSDataType(ScalarType scalar_type) { case ScalarType::Half: return MPSDataTypeFloat16; case ScalarType::BFloat16: - checkSupportsBFloat16(); return MPSDataTypeBFloat16; case ScalarType::Int: return MPSDataTypeInt32; @@ -119,10 +114,8 @@ MPSDataType getMPSDataType(ScalarType scalar_type) { "Cannot convert a float64 Tensor to MPS as the MPS framework doesn't support float64. " "Please use float32 instead.") case ScalarType::ComplexHalf: - checkSupportsComplex(); return MPSDataTypeComplexFloat16; case ScalarType::ComplexFloat: - checkSupportsComplex(); return MPSDataTypeComplexFloat32; // Unsigned types case ScalarType::UInt64: @@ -140,16 +133,10 @@ MPSDataType getMPSDataType(ScalarType scalar_type) { // #issue 104398441 sortWithTensor and argsortWithTensor has support of // Int32, Half and Float32 types. These utilities are to help cast to these // types. 
-MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, - MPSGraphTensor* inputTensor, - const TensorBase& input, - bool includesInt64) { +MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const TensorBase& input) { MPSDataType dataType = getMPSDataType(input.scalar_type()); - bool condition = - (dataType != MPSDataTypeInt32) && (dataType != MPSDataTypeFloat32) && (dataType != MPSDataTypeFloat16); - if (includesInt64) { - condition = condition && (dataType != MPSDataTypeInt64); - } + bool condition = (dataType != MPSDataTypeInt32) && (dataType != MPSDataTypeFloat32) && + (dataType != MPSDataTypeFloat16) && (dataType != MPSDataTypeInt64); if (condition) { dataType = (dataType & MPSDataTypeFloatBit) ? MPSDataTypeFloat32 : MPSDataTypeInt32; return [mpsGraph castTensor:inputTensor toType:dataType name:@"castInputTensor"]; @@ -160,16 +147,10 @@ MPSDataType getMPSDataType(ScalarType scalar_type) { // #issue 104398441 sortWithTensor and argsortWithTensor has support of // Int32, Half and Float32 types. These utilities are to help cast from these // types. -MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, - MPSGraphTensor* inputTensor, - const TensorBase& input, - bool includesInt64) { +MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const TensorBase& input) { MPSDataType dataType = getMPSDataType(input.scalar_type()); - bool condition = - (dataType != MPSDataTypeInt32) && (dataType != MPSDataTypeFloat32) && (dataType != MPSDataTypeFloat16); - if (includesInt64) { - condition = condition && (dataType != MPSDataTypeInt64); - } + bool condition = (dataType != MPSDataTypeInt32) && (dataType != MPSDataTypeFloat32) && + (dataType != MPSDataTypeFloat16) && (dataType != MPSDataTypeInt64); if (condition) { inputTensor = [mpsGraph castTensor:inputTensor toType:dataType name:@"castInputTensor"]; } @@ -186,7 +167,6 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { case ScalarType::Half: return MPSDataTypeFloat16; case ScalarType::BFloat16: - checkSupportsBFloat16(); return MPSDataTypeBFloat16; case ScalarType::Int: return MPSDataTypeInt32; @@ -201,13 +181,11 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { case ScalarType::Bool: return MPSDataTypeBool; case ScalarType::ComplexHalf: - checkSupportsComplex(); return MPSDataTypeComplexFloat16; // This is an intentional fallthrough supporting ComplexDouble for Scalar // types as they are casted to Complex64 currently. case ScalarType::ComplexDouble: case ScalarType::ComplexFloat: - checkSupportsComplex(); return MPSDataTypeComplexFloat32; // Unsigned types case ScalarType::UInt64: @@ -267,7 +245,6 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { case ScalarType::Half: return "half"; case ScalarType::BFloat16: - checkSupportsBFloat16(); return "bfloat"; case ScalarType::Int: return "int"; @@ -879,9 +856,7 @@ void executeMPSAllocatorCallback(void* ptr, EventType event) override {} MTLCompileOptions* options = compile_options; if (!options) { options = [[MTLCompileOptions new] autorelease]; - // Need 3.0 for atomic oprations, 3.1 introduces bfloat support - [options setLanguageVersion:is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS) ? MTLLanguageVersion3_1 - : MTLLanguageVersion3_0]; + [options setLanguageVersion:MTLLanguageVersion3_1]; if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS)) { options.mathMode = fast_math ? 
MTLMathModeFast : MTLMathModeSafe; options.mathFloatingPointFunctions = diff --git a/aten/src/ATen/native/mps/kernels/BinaryKernel.metal b/aten/src/ATen/native/mps/kernels/BinaryKernel.metal index f6f4935608e49..0539eab79500d 100644 --- a/aten/src/ATen/native/mps/kernels/BinaryKernel.metal +++ b/aten/src/ATen/native/mps/kernels/BinaryKernel.metal @@ -39,6 +39,13 @@ struct lerp_alpha_functor { } }; +struct native_dropout_mask_and_scale_functor { + template + inline TA operator()(const TI a, const TI b, const TA scale) { + return static_cast(a) * static_cast(b) * scale; + } +}; + struct fmax_functor { template inline T operator()(const T a, const T b) { @@ -315,6 +322,20 @@ struct fmod_functor { } }; +struct igamma_functor { + template + inline T operator()(const T a, const T b) { + return c10::metal::igamma(a, b); + } +}; + +struct igammac_functor { + template + inline T operator()(const T a, const T b) { + return c10::metal::igammac(a, b); + } +}; + #define REGISTER_INTEGER_BINARY_OP(NAME) \ REGISTER_BINARY_OP(NAME, long, long); \ REGISTER_BINARY_OP(NAME, int, int); \ @@ -386,6 +407,8 @@ REGISTER_OPMATH_FLOAT_BINARY_OP(remainder); REGISTER_INTEGER_BINARY_OP(remainder); REGISTER_OPMATH_FLOAT_BINARY_OP(fmod); REGISTER_INTEGER_BINARY_OP(fmod); +REGISTER_OPMATH_FLOAT_BINARY_OP(igamma); +REGISTER_OPMATH_FLOAT_BINARY_OP(igammac); REGISTER_BINARY_ALPHA_OP(add_alpha, long, long, long); REGISTER_BINARY_ALPHA_OP(add_alpha, int, int, int); REGISTER_BINARY_ALPHA_OP(add_alpha, float, float, float); @@ -411,6 +434,10 @@ REGISTER_BINARY_ALPHA_OP(lerp_alpha, uchar, uchar, uchar); REGISTER_BINARY_ALPHA_OP(lerp_alpha, char, char, char); REGISTER_BINARY_ALPHA_OP(lerp_alpha, bool, bool, bool); +REGISTER_BINARY_ALPHA_OP(native_dropout_mask_and_scale, float, float, float); +REGISTER_BINARY_ALPHA_OP(native_dropout_mask_and_scale, bfloat, bfloat, bfloat); +REGISTER_BINARY_ALPHA_OP(native_dropout_mask_and_scale, half, half, half); + REGISTER_BINARY_ALPHA_OP(add_alpha, bfloat, bfloat, bfloat); REGISTER_BINARY_ALPHA_OP(sub_alpha, bfloat, bfloat, bfloat); REGISTER_BINARY_ALPHA_OP(lerp_alpha, bfloat, bfloat, bfloat); diff --git a/aten/src/ATen/native/mps/kernels/GridSampler.h b/aten/src/ATen/native/mps/kernels/GridSampler.h new file mode 100644 index 0000000000000..c2b3cad3cd47d --- /dev/null +++ b/aten/src/ATen/native/mps/kernels/GridSampler.h @@ -0,0 +1,25 @@ +#pragma once +#include + +#ifdef __METAL__ +enum class GridSamplerInterpolation { Bilinear, Nearest, Bicubic }; +enum class GridSamplerPadding { Zeros, Border, Reflection }; +#else +#include +using at::native::GridSamplerInterpolation; +using at::native::GridSamplerPadding; +#endif + +template +struct GridSamplerParams { + int32_t sampler_dims; + ::c10::metal::array output_sizes; + ::c10::metal::array output_strides; + ::c10::metal::array input_sizes; + ::c10::metal::array input_strides; + ::c10::metal::array grid_sizes; + ::c10::metal::array grid_strides; + GridSamplerInterpolation interpolation_mode; + GridSamplerPadding padding_mode; + bool align_corners; +}; diff --git a/aten/src/ATen/native/mps/kernels/GridSampler.metal b/aten/src/ATen/native/mps/kernels/GridSampler.metal new file mode 100644 index 0000000000000..331793e08d664 --- /dev/null +++ b/aten/src/ATen/native/mps/kernels/GridSampler.metal @@ -0,0 +1,324 @@ +#include +#include +#include +#include + +using namespace metal; +using namespace c10::metal; + +struct GridSamplerOffsets { + int32_t output; + int32_t input; + int32_t grid; + + GridSamplerOffsets() : output(0), input(0), grid(0) {} +}; + +// 
Find offsets into the tensors that this thread will operate on, +// based on the thread ID. +static GridSamplerOffsets find_grid_sampler_offsets( + constant int32_t* output_sizes, + constant int32_t* output_strides, + constant int32_t* input_strides, + constant int32_t* grid_strides, + int32_t sampler_dims, + uint tid) { + auto dims = sampler_dims + 2; + auto output_idx = static_cast(tid); + GridSamplerOffsets offsets; + + for (auto dim = dims - 1; dim >= 0; dim--) { + auto dim_idx = output_idx % output_sizes[dim]; + output_idx = output_idx / output_sizes[dim]; + + // Select the output element that this thread will calculate. + // output shape: + // 2 sampler dims: (N, C, Hout, Wout) + // 3 sampler dims: (N, C, Dout, Hout, Wout) + offsets.output += output_strides[dim] * dim_idx; + + // Select the batch and channel for the input. + // input shape: + // 2 sampler dims: (N, C, Hin, Win) + // 3 sampler dims: (N, C, Din, Hin, Win) + if (dim < 2) { + offsets.input += input_strides[dim] * dim_idx; + } + + // Select the grid coordinates for the output element. + // grid shape: + // 2 sampler dims: (N, Hout, Wout, 2) + // 3 sampler dims: (N, Dout, Hout, Wout, 3) + if (dim == 0) { + offsets.grid += grid_strides[dim] * dim_idx; + } else if (dim >= 2) { + offsets.grid += grid_strides[dim - 1] * dim_idx; + } + } + + return offsets; +} + +// Mod function which gives postive output when `a` is negative +static int32_t mod(int32_t a, int32_t b) { + auto r = a % b; + return r + (r < 0 ? b : 0); +} + +// Sentinel index value to indicate zero padding +constant int32_t IDX_ZERO = -1; + +// Apply padding to an index into the input +static int32_t pad_input_index( + int32_t idx, + int32_t input_size, + GridSamplerPadding padding_mode, + bool align_corners) { + int32_t idx_padded = idx; + + if (padding_mode == GridSamplerPadding::Zeros) { + idx_padded = (idx < 0) ? IDX_ZERO : idx_padded; + idx_padded = (idx >= input_size) ? IDX_ZERO : idx_padded; + + } else if (padding_mode == GridSamplerPadding::Border) { + idx_padded = (idx < 0) ? 0 : idx_padded; + idx_padded = (idx >= input_size) ? input_size - 1 : idx_padded; + + } else if (padding_mode == GridSamplerPadding::Reflection) { + auto scale_length = align_corners ? (input_size - 1) : input_size; + auto idx_mod = mod(idx, scale_length); + auto idx_mod_reverse = (input_size - 1) - idx_mod; + bool is_reverse = (abs(idx - idx_mod) / scale_length) % 2 == 1; + idx_padded = is_reverse ? idx_mod_reverse : idx_mod; + } + return idx_padded; +} + +template +T get_tensor_val( + constant T* input, + constant int32_t* input_strides, + int32_t indices[dims]) { + bool found_idx_zero = false; + int32_t offset = 0; + + for (auto dim = 0; dim < dims; dim++) { + auto idx = indices[dim]; + found_idx_zero = found_idx_zero || (idx == IDX_ZERO); + offset += (found_idx_zero ? 0 : idx) * input_strides[dim]; + } + + return found_idx_zero ? 0 : input[offset]; +} + +// This function performs 3D linear interpolation for one value. One way to +// think of how this works is to imagine a unit cube where each corner of the +// cube has one scalar value associated with it. Inside the cube, the values +// change linearly, so the gradient is constant. The values associated with each +// corner are given by the `input`, indexed at all eight different combinations +// of the `left_indices` and `right_indices`. Given a 3D coordinate anywhere +// within the cube, specified by the `scales` argument, we must calculate the +// value associated with that position. 
+template +T interpolate_linear_3d( + constant T* input, + constant int32_t* input_strides, + int32_t left_indices[3], + int32_t right_indices[3], + opmath_t scales[3]) { + int32_t a_idx[3] = {left_indices[0], left_indices[1], left_indices[2]}; + int32_t b_idx[3] = {left_indices[0], left_indices[1], right_indices[2]}; + int32_t c_idx[3] = {left_indices[0], right_indices[1], left_indices[2]}; + int32_t d_idx[3] = {left_indices[0], right_indices[1], right_indices[2]}; + int32_t e_idx[3] = {right_indices[0], left_indices[1], left_indices[2]}; + int32_t f_idx[3] = {right_indices[0], left_indices[1], right_indices[2]}; + int32_t g_idx[3] = {right_indices[0], right_indices[1], left_indices[2]}; + int32_t h_idx[3] = {right_indices[0], right_indices[1], right_indices[2]}; + auto a = + static_cast>(get_tensor_val<3>(input, input_strides, a_idx)); + auto b = + static_cast>(get_tensor_val<3>(input, input_strides, b_idx)); + auto c = + static_cast>(get_tensor_val<3>(input, input_strides, c_idx)); + auto d = + static_cast>(get_tensor_val<3>(input, input_strides, d_idx)); + auto e = + static_cast>(get_tensor_val<3>(input, input_strides, e_idx)); + auto f = + static_cast>(get_tensor_val<3>(input, input_strides, f_idx)); + auto g = + static_cast>(get_tensor_val<3>(input, input_strides, g_idx)); + auto h = + static_cast>(get_tensor_val<3>(input, input_strides, h_idx)); + + auto scale0_right = scales[0]; + auto scale1_right = scales[1]; + auto scale2_right = scales[2]; + auto scale0_left = 1 - scale0_right; + auto scale1_left = 1 - scale1_right; + auto scale2_left = 1 - scale2_right; + + return static_cast( + scale0_left * scale1_left * scale2_left * a + + scale0_left * scale1_left * scale2_right * b + + scale0_left * scale1_right * scale2_left * c + + scale0_left * scale1_right * scale2_right * d + + scale0_right * scale1_left * scale2_left * e + + scale0_right * scale1_left * scale2_right * f + + scale0_right * scale1_right * scale2_left * g + + scale0_right * scale1_right * scale2_right * h); +} + +// Calculates a single output element. +// `input` shape: +// 2 sampler dims: (Hin, Win) +// 3 sampler dims: (Din, Hin, Win) +// `coords` values: +// 2 sampler dims: (Wcoord, Hcoord) +// 3 sampler dims: (Wcoord, Hcoord, Dcoord) +template +void grid_sampler_single_element( + device T* output, + constant T* input, + constant T* coords, + int32_t dims, + constant int32_t* input_sizes, + constant int32_t* input_strides, + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode, + bool align_corners) { + int32_t left_indices[3]; + int32_t right_indices[3]; + opmath_t scales[3]; + + // For each dimension, find the pair of indices in the cooresponding dimension + // of `input` which surround the grid coordinate in that dimension. We'll do + // this by mapping different coordiante spaces onto each other. There are + // basically three different coordinate spaces to keep in mind: + // + // * aligned grid space + // - `-1` refers to the leftmost input value. + // - `1` refers to the rightmost input value. + // + // * unaligned grid space + // - `-1` refers to the midpoint between the leftmost input value and + // a padding value to the left of that. + // - `1` refers to the midpoint between the rightmost input value and + // a padding value to the right of that. + // + // * input index space + // - `n` refers to the n-th value of the input. + // - `0` refers to the leftmost input value. + // - `N-1` refers to the rightmost input value. 
+ // + // If `align_corners == False`, then the coordinates are is in unaligned grid + // space, and we will map it onto aligned grid space. If `align_corners == + // True`, then coordinates are already in aligned grid space. + // + // Then we will map unaligned grid space onto input index space, making it + // relatively simple to find the two input indices that surround the + // coordinate. + for (auto coord_dim = 0; coord_dim < dims; coord_dim++) { + auto input_dim = dims - coord_dim - 1; + auto input_size = input_sizes[input_dim]; + auto coord = static_cast>(coords[coord_dim]); + + // Interpret nan as -1 + coord = isnan(coord) ? -1 : coord; + + if (!align_corners) { + // Map unaligned grid space to aligned grid space + auto corner_alignment_factor = static_cast>(input_size) / + static_cast>(input_size - 1); + coord = coord * corner_alignment_factor; + } + + // Map aligned grid space to input index space + coord = (coord + 1) * (static_cast>(input_size - 1) / 2); + + // Get the input indices surrounding the coordinate, apply padding to them, + // and obtain the scaling factor between the two for interpolation. + auto left_idx = static_cast(floor(coord)); + auto right_idx = static_cast(ceil(coord)); + left_indices[input_dim] = + pad_input_index(left_idx, input_size, padding_mode, align_corners); + right_indices[input_dim] = + pad_input_index(right_idx, input_size, padding_mode, align_corners); + + auto scale = coord - left_idx; + + if (interpolation_mode == GridSamplerInterpolation::Nearest) { + // TODO: For some reason, rounding the scale to 0 or 1 and then using + // linear interpolation seems to work perfectly with zero padding mode, + // but we get flaky failures with border and reflection padding modes. + // Need to investigate and fix it. + scale = (scale <= 0.5) ? 0 : 1; + } + scales[input_dim] = scale; + } + + // Now that we have the bounding indices and scale factor for each dimension + // of the input, we can interpolate. 
+ if (dims == 3) { + *output = interpolate_linear_3d( + input, input_strides, left_indices, right_indices, scales); + } +} + +template +kernel void grid_sampler( + device T* output [[buffer(0)]], + constant T* input [[buffer(1)]], + constant T* grid [[buffer(2)]], + constant GridSamplerParams<5>& params [[buffer(3)]], + uint tid [[thread_position_in_grid]]) { + auto output_sizes = params.output_sizes.data(); + auto output_strides = params.output_strides.data(); + auto input_sizes = params.input_sizes.data(); + auto input_strides = params.input_strides.data(); + auto grid_strides = params.grid_strides.data(); + auto sampler_dims = params.sampler_dims; + + auto offsets = find_grid_sampler_offsets( + output_sizes, + output_strides, + input_strides, + grid_strides, + sampler_dims, + tid); + + output += offsets.output; + input += offsets.input; + auto coords = grid + offsets.grid; + + input_sizes += 2; + input_strides += 2; + + auto interpolation_mode = params.interpolation_mode; + auto padding_mode = params.padding_mode; + auto align_corners = params.align_corners; + + grid_sampler_single_element( + output, + input, + coords, + sampler_dims, + input_sizes, + input_strides, + interpolation_mode, + padding_mode, + align_corners); +} + +#define REGISTER_GRID_SAMPLER_OP(DTYPE) \ + template [[host_name("grid_sampler_" #DTYPE)]] \ + kernel void grid_sampler( \ + device DTYPE * output [[buffer(0)]], \ + constant DTYPE * input [[buffer(1)]], \ + constant DTYPE * grid [[buffer(2)]], \ + constant GridSamplerParams<5> & params [[buffer(3)]], \ + uint tid [[thread_position_in_grid]]); + +REGISTER_GRID_SAMPLER_OP(float); +REGISTER_GRID_SAMPLER_OP(half); +REGISTER_GRID_SAMPLER_OP(bfloat); diff --git a/aten/src/ATen/native/mps/kernels/Indexing.metal b/aten/src/ATen/native/mps/kernels/Indexing.metal index 7503d8b2b1c8b..b41e64d70ced5 100644 --- a/aten/src/ATen/native/mps/kernels/Indexing.metal +++ b/aten/src/ATen/native/mps/kernels/Indexing.metal @@ -5,29 +5,6 @@ using namespace metal; using namespace c10::metal; -namespace c10 { -namespace metal { -// There are no atomic 64-bit add in Metal yet, but this implements a consistent -// add I.e. if multiple threads are modify the same 64-bit value, results stored -// at the address will eventually be equal to its original value plus sum of all -// operands -template <> -struct AtomicType { - using type = ::metal::atomic; - static inline void atomic_add(device type* data, long offset, long value) { - const auto value_bits = as_type(value); - const uint low = static_cast(value_bits); - uint high = static_cast(value_bits >> 32); - auto ptr = data + (offset << 1); - auto old_low = atomic_fetch_add_explicit(ptr, low, memory_order_relaxed); - high += (old_low + low < old_low) ? 
1 : 0; - atomic_fetch_add_explicit(ptr + 1, high, memory_order_relaxed); - } -}; - -} // namespace metal -} // namespace c10 - struct IndexAB { constant int64_t* indexArray; }; @@ -234,13 +211,15 @@ REGISTER_INDEX_OP_ALL_DTYPES(put_serial); REGISTER_INDEX_OP(put_accumulate, float, float); REGISTER_INDEX_OP(put_accumulate, half, half); +REGISTER_INDEX_OP(put_accumulate, bfloat, bfloat); REGISTER_INDEX_OP(put_accumulate, long, long); REGISTER_INDEX_OP(put_accumulate, int, int); REGISTER_INDEX_OP(put_accumulate, short, short); REGISTER_INDEX_OP(put_accumulate, char, char); REGISTER_INDEX_OP(put_accumulate, uchar, uchar); REGISTER_INDEX_OP(put_accumulate, bool, bool); -REGISTER_INDEX_OP(put_accumulate, bfloat, bfloat); +REGISTER_INDEX_OP(put_accumulate, float2, float2); +REGISTER_INDEX_OP(put_accumulate, half2, half2); template kernel void kernel_index_offsets( @@ -379,6 +358,7 @@ kernel void index_copy_strided( constant long* input_strides, constant long* output_strides, constant long* source_strides, + constant long& indices_stride, uint thread_index [[thread_position_in_grid]]) { int pos[max_ndim]; pos_from_thread_index(int(thread_index), pos, sizes, ndim); @@ -395,7 +375,7 @@ kernel void index_copy_strided( // find the last index in the indices array that equals this coordinate int last_matching_index = -1; for (uint i = 0; i < indices_numel; i++) { - if (indices[i] == orig_dim) { + if (indices[i * indices_stride] == orig_dim) { last_matching_index = int(i); } } @@ -434,6 +414,7 @@ kernel void index_copy_strided( constant long*, \ constant long*, \ constant long*, \ + constant long&, \ uint); #define REGISTER_MASKED_FILL_SCALAR(SIZE, DTYPE) \ diff --git a/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal b/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal index 92774f3ff2668..4ba2bca720db7 100644 --- a/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal +++ b/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal @@ -68,6 +68,37 @@ kernel void matmul( } } +template +kernel void addmm( + constant T* mat1Data [[buffer(0)]], + constant T* mat2Data [[buffer(1)]], + device T* outputData [[buffer(2)]], + constant T* biasData [[buffer(3)]], + constant array, 2>& alpha_beta [[buffer(4)]], + constant array& strides [[buffer(5)]], + constant uint3& sizes [[buffer(6)]], + uint2 tid [[thread_position_in_threadgroup]], + uint2 thread_id [[thread_position_in_grid]]) { + threadgroup T A_tile[TILE_DIM][TILE_DIM]; + threadgroup T B_tile[TILE_DIM][TILE_DIM]; + + auto sum = matmul_inner( + mat1Data, + mat2Data, + reinterpret_cast&>(strides), + sizes, + A_tile, + B_tile, + tid, + thread_id); + if (thread_id.y < sizes.x && thread_id.x < sizes.z) { + auto bias = + biasData[thread_id.y * strides[3].x + thread_id.x * strides[3].y]; + outputData[thread_id.y * strides[2].x + thread_id.x * strides[2].y] = + static_cast(alpha_beta[0] * sum + alpha_beta[1] * bias); + } +} + template kernel void naive_bmm( constant T* mat1Data [[buffer(0)]], @@ -613,17 +644,15 @@ kernel void applyPivots( } } -#define INSTANTIATE_NAIVE_MM(DTYPE) \ - template [[host_name("matmul_" #DTYPE)]] kernel void matmul( \ - constant DTYPE * mat1Data [[buffer(0)]], \ - constant DTYPE * mat2Data [[buffer(1)]], \ - device DTYPE * outputData [[buffer(2)]], \ - constant array & strides [[buffer(3)]], \ - constant uint3 & sizes [[buffer(4)]], \ - uint2 tid [[thread_position_in_threadgroup]], \ - uint2 group_id [[threadgroup_position_in_grid]]) - -#define INSTANTIATE_NAIVE_BMM(DTYPE) \ +#define INSTANTIATE_MM_OPS(DTYPE) \ + template 
[[host_name("matmul_" #DTYPE)]] kernel void matmul( \ + constant DTYPE * mat1Data [[buffer(0)]], \ + constant DTYPE * mat2Data [[buffer(1)]], \ + device DTYPE * outputData [[buffer(2)]], \ + constant array & strides [[buffer(3)]], \ + constant uint3 & sizes [[buffer(4)]], \ + uint2 tid [[thread_position_in_threadgroup]], \ + uint2 group_id [[threadgroup_position_in_grid]]); \ template [[host_name("naive_bmm_" #DTYPE)]] kernel void naive_bmm( \ constant DTYPE * mat1Data [[buffer(0)]], \ constant DTYPE * mat2Data [[buffer(1)]], \ @@ -631,20 +660,26 @@ kernel void applyPivots( constant array & strides [[buffer(3)]], \ constant uint4 & sizes [[buffer(4)]], \ uint3 tid [[thread_position_in_threadgroup]], \ - uint3 group_id [[threadgroup_position_in_grid]]) + uint3 group_id [[threadgroup_position_in_grid]]); \ + template [[host_name("addmm_" #DTYPE)]] kernel void addmm( \ + constant DTYPE * mat1Data [[buffer(0)]], \ + constant DTYPE * mat2Data [[buffer(1)]], \ + device DTYPE * outputData [[buffer(2)]], \ + constant DTYPE * biasData [[buffer(3)]], \ + constant array, 2> & \ + alpha_beta [[buffer(4)]], \ + constant array & strides [[buffer(5)]], \ + constant uint3 & sizes [[buffer(6)]], \ + uint2 tid [[thread_position_in_threadgroup]], \ + uint2 group_id [[threadgroup_position_in_grid]]) -INSTANTIATE_NAIVE_MM(float); -INSTANTIATE_NAIVE_MM(half); -INSTANTIATE_NAIVE_MM(bfloat); +INSTANTIATE_MM_OPS(float); +INSTANTIATE_MM_OPS(half); +INSTANTIATE_MM_OPS(bfloat); // Integral MM -INSTANTIATE_NAIVE_MM(short); -INSTANTIATE_NAIVE_MM(int); -INSTANTIATE_NAIVE_MM(long); -INSTANTIATE_NAIVE_MM(char); -INSTANTIATE_NAIVE_MM(uchar); -INSTANTIATE_NAIVE_BMM(short); -INSTANTIATE_NAIVE_BMM(int); -INSTANTIATE_NAIVE_BMM(long); -INSTANTIATE_NAIVE_BMM(char); -INSTANTIATE_NAIVE_BMM(uchar); +INSTANTIATE_MM_OPS(long); +INSTANTIATE_MM_OPS(int); +INSTANTIATE_MM_OPS(short); +INSTANTIATE_MM_OPS(char); +INSTANTIATE_MM_OPS(uchar); diff --git a/aten/src/ATen/native/mps/kernels/Pooling.metal b/aten/src/ATen/native/mps/kernels/Pooling.metal index 4eec3ed4d1b6e..3eee8bb079a7a 100644 --- a/aten/src/ATen/native/mps/kernels/Pooling.metal +++ b/aten/src/ATen/native/mps/kernels/Pooling.metal @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -88,6 +89,53 @@ void max_pool_3d_input_iter( } } +template +void max_pool_2d_input_iter( + constant T* input, + device T* output, + device int64_t* indices, + constant int32_t* input_sizes, + constant int32_t* input_strides, + thread int32_t (&pooling_dim_indices)[3], + constant int32_t* kernel_size, + constant int32_t* stride, + constant int32_t* padding, + constant int32_t* dilation) { + auto bounds0 = get_input_iter_bounds<0>( + input_sizes, pooling_dim_indices, kernel_size, stride, padding, dilation); + auto bounds1 = get_input_iter_bounds<1>( + input_sizes, pooling_dim_indices, kernel_size, stride, padding, dilation); + + auto d0 = dilation[0]; + auto d1 = dilation[1]; + + T max_value = input + [input_strides[0] * bounds0.start + input_strides[1] * bounds1.start]; + auto max_index = bounds0.start * input_sizes[1] + bounds1.start; + + for (auto i0 = bounds0.start; i0 < bounds0.end; i0 += d0) { + auto offset0 = input_strides[0] * i0; + + for (auto i1 = bounds1.start; i1 < bounds1.end; i1 += d1) { + auto offset1 = input_strides[1] * i1; + + auto input_value = input[offset0 + offset1]; + bool is_greater = input_value > max_value; + + max_value = is_greater ? input_value : max_value; + + if (return_indices) { + auto input_index = i0 * input_sizes[1] + i1; + max_index = is_greater ? 
input_index : max_index; + } + } + } + *output = max_value; + if (return_indices) { + *indices = max_index; + } +} + struct PoolOffsets { int32_t output; int32_t indices; @@ -212,7 +260,7 @@ kernel void max_pool( PoolOffsets offsets = find_pool_offsets( output_sizes, output_strides, - indices_strides, + return_indices ? indices_strides : nullptr, input_strides, pooling_dim_indices, dims, @@ -224,18 +272,47 @@ kernel void max_pool( indices += offsets.indices; input += offsets.input_leading; - max_pool_3d_input_iter( - input, - output, - indices, - input_sizes + leading_dims, - input_strides + leading_dims, - pooling_dim_indices, - kernel_size, - stride, - padding, - dilation, - return_indices); + switch (pooling_dims) { + case 2: + if (return_indices) { + return max_pool_2d_input_iter( + input, + output, + indices, + input_sizes + leading_dims, + input_strides + leading_dims, + pooling_dim_indices, + kernel_size, + stride, + padding, + dilation); + } else { + return max_pool_2d_input_iter( + input, + output, + indices, + input_sizes + leading_dims, + input_strides + leading_dims, + pooling_dim_indices, + kernel_size, + stride, + padding, + dilation); + } + case 3: + return max_pool_3d_input_iter( + input, + output, + indices, + input_sizes + leading_dims, + input_strides + leading_dims, + pooling_dim_indices, + kernel_size, + stride, + padding, + dilation, + return_indices); + } } // Finds the element in the grad input which corresponds to the index into the @@ -426,8 +503,8 @@ void avg_pool_3d_input_iter( padding, count_include_pad); - T value_sum = 0; - auto divisor = has_divisor_override + opmath_t value_sum = 0; + opmath_t divisor = has_divisor_override ? divisor_override : (bounds0.count) * (bounds1.count) * (bounds2.count); @@ -440,11 +517,58 @@ void avg_pool_3d_input_iter( for (auto i2 = bounds2.start; i2 < bounds2.end; i2++) { auto offset2 = input_strides[2] * i2; auto input_value = input[offset0 + offset1 + offset2]; - value_sum += input_value; + value_sum += static_cast>(input_value); } } } - *output = value_sum / static_cast(divisor); + *output = static_cast(value_sum / divisor); +} + +// Iterates through all the input elements that this kernel needs to +// apply max to. Specialized for 2 pooling dimensions. +template +void avg_pool_2d_input_iter( + constant T* input, + device T* output, + constant int32_t* input_sizes, + constant int32_t* input_strides, + thread int32_t (&pooling_dim_indices)[3], + constant int32_t* kernel_size, + constant int32_t* stride, + constant int32_t* padding, + bool count_include_pad, + bool has_divisor_override, + int32_t divisor_override) { + auto bounds0 = get_avg_pool_input_iter_bounds<0>( + input_sizes, + pooling_dim_indices, + kernel_size, + stride, + padding, + count_include_pad); + auto bounds1 = get_avg_pool_input_iter_bounds<1>( + input_sizes, + pooling_dim_indices, + kernel_size, + stride, + padding, + count_include_pad); + + opmath_t value_sum = 0; + opmath_t divisor = has_divisor_override + ? 
divisor_override + : (bounds0.count) * (bounds1.count); + + for (auto i0 = bounds0.start; i0 < bounds0.end; i0++) { + auto offset0 = input_strides[0] * i0; + + for (auto i1 = bounds1.start; i1 < bounds1.end; i1++) { + auto offset1 = input_strides[1] * i1; + auto input_value = input[offset0 + offset1]; + value_sum += static_cast>(input_value); + } + } + *output = static_cast(value_sum / divisor); } template @@ -543,18 +667,33 @@ kernel void avg_pool( input_sizes += leading_dims; input_strides += leading_dims; - avg_pool_3d_input_iter( - input, - output, - input_sizes, - input_strides, - pooling_dim_indices, - kernel_size, - stride, - padding, - params.count_include_pad, - params.has_divisor_override, - params.divisor_override); + if (pooling_dims == 3) { + avg_pool_3d_input_iter( + input, + output, + input_sizes, + input_strides, + pooling_dim_indices, + kernel_size, + stride, + padding, + params.count_include_pad, + params.has_divisor_override, + params.divisor_override); + } else if (pooling_dims == 2) { + avg_pool_2d_input_iter( + input, + output, + input_sizes, + input_strides, + pooling_dim_indices, + kernel_size, + stride, + padding, + params.count_include_pad, + params.has_divisor_override, + params.divisor_override); + } } template diff --git a/aten/src/ATen/native/mps/kernels/UnaryKernel.metal b/aten/src/ATen/native/mps/kernels/UnaryKernel.metal index 23c4810a24963..7db38da80532f 100644 --- a/aten/src/ATen/native/mps/kernels/UnaryKernel.metal +++ b/aten/src/ATen/native/mps/kernels/UnaryKernel.metal @@ -490,11 +490,6 @@ struct bitwise_not_functor { } }; -template -float erfc(T x) { - return 1.0 - erf(x); -} - struct round_decimals_functor { template inline T operator()(const T x, const long ndigits) { @@ -503,6 +498,17 @@ struct round_decimals_functor { } }; +struct round_functor { + template , bool> = true> + inline T operator()(const T x) { + return static_cast(rint(float(x))); + } + template , bool> = true> + inline T operator()(const T x) { + return x; + } +}; + DEFINE_UNARY_FLOATING_FUNCTOR(erf); DEFINE_UNARY_FLOATING_FUNCTOR(erfc); DEFINE_UNARY_FLOATING_FUNCTOR(erfinv); @@ -515,6 +521,13 @@ REGISTER_UNARY_OP(neg, char, char); REGISTER_UNARY_OP(neg, uchar, uchar); REGISTER_UNARY_OP(neg, float, float); REGISTER_UNARY_OP(neg, half, half); +REGISTER_UNARY_OP(round, int, int); +REGISTER_UNARY_OP(round, long, long); +REGISTER_UNARY_OP(round, short, short); +REGISTER_UNARY_OP(round, char, char); +REGISTER_UNARY_OP(round, uchar, uchar); +REGISTER_UNARY_OP(round, float, float); +REGISTER_UNARY_OP(round, half, half); REGISTER_UNARY_OP(bitwise_not, int, int); REGISTER_UNARY_OP(bitwise_not, long, long); @@ -558,6 +571,7 @@ REGISTER_UNARY_OP(abs, half, half); INSTANTIATE_UNARY_KERNELS2(bfloat, bfloat); REGISTER_UNARY_OP(neg, bfloat, bfloat); +REGISTER_UNARY_OP(round, bfloat, bfloat); REGISTER_UNARY_OP(abs, bfloat, bfloat); INSTANTIATE_UNARY_KERNELS2(half, half); INSTANTIATE_UNARY_KERNELS2(float, float); diff --git a/aten/src/ATen/native/mps/operations/BinaryKernel.mm b/aten/src/ATen/native/mps/operations/BinaryKernel.mm index 806eeb82e1d17..0b303f48028f4 100644 --- a/aten/src/ATen/native/mps/operations/BinaryKernel.mm +++ b/aten/src/ATen/native/mps/operations/BinaryKernel.mm @@ -53,6 +53,7 @@ void binary_op_kernel(const std::string func_name, .add_input(input) .add_input(other) .check_all_same_dtype(false) + .promote_inputs_to_common_dtype(true) .build(); lib.exec_binary_kernel(iter, func_name, alpha); @@ -167,6 +168,10 @@ static void lerp_scalar_mps_kernel(at::TensorIteratorBase& iter, 
const Scalar& w lib.exec_binary_kernel(iter, "lerp_alpha", weight); } +static void native_dropout_mask_and_scale_mps_kernel(at::TensorIteratorBase& iter, const Scalar& scale) { + lib.exec_binary_kernel(iter, "native_dropout_mask_and_scale", scale); +} + static void mul_mps_kernel(TensorIteratorBase& iter) { lib.exec_binary_kernel(iter, "mul"); } @@ -191,6 +196,14 @@ static void fmod_mps_kernel(TensorIteratorBase& iter) { lib.exec_binary_kernel(iter, "fmod"); } +static void igamma_mps_kernel(TensorIteratorBase& iter) { + lib.exec_binary_kernel(iter, "igamma"); +} + +static void igammac_mps_kernel(TensorIteratorBase& iter) { + lib.exec_binary_kernel(iter, "igammac"); +} + REGISTER_DISPATCH(fmax_stub, &fmax_mps_kernel) REGISTER_DISPATCH(fmin_stub, &fmin_mps_kernel) REGISTER_DISPATCH(copysign_stub, ©sign_mps_kernel) @@ -216,4 +229,6 @@ static void fmod_mps_kernel(TensorIteratorBase& iter) { REGISTER_DISPATCH(div_trunc_stub, &div_trunc_mps_kernel) REGISTER_DISPATCH(fmod_stub, &fmod_mps_kernel) REGISTER_DISPATCH(remainder_stub, &remainder_mps_kernel) +REGISTER_DISPATCH(igamma_stub, &igamma_mps_kernel) +REGISTER_DISPATCH(igammac_stub, &igammac_mps_kernel) } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm index a9589ecc490ee..06b6edcff9407 100644 --- a/aten/src/ATen/native/mps/operations/BinaryOps.mm +++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm @@ -48,28 +48,11 @@ #define BinaryOpFn(graph, primary, secondary) \ MPSGraphTensor*(mps::BinaryOpCachedGraph * graph, MPSGraphTensor * primary, MPSGraphTensor * secondary) -static inline Tensor legacy_complex_as_view(const Tensor& t) { - // Convert non-complex types (and cdouble CPU scalars) to cfloat - if (!isComplexType(t.scalar_type()) || t.scalar_type() == kComplexDouble) { - return at::view_as_real(t.to(kMPS, kComplexFloat)); - } - return at::view_as_real(t.dim() != 0 ? 
t : t.to(kMPS)); -} - static void binaryOpTensor(const Tensor& self, const Tensor& other, const Tensor& output_, std::string op_name, BinaryOpBlock binaryBlock) { - TORCH_CHECK(!(op_name == "power" && !is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS) && - (self.scalar_type() == ScalarType::Long || - (other.scalar_type() == ScalarType::Long && - (self.scalar_type() != ScalarType::Half && self.scalar_type() != ScalarType::Float)))), - "MPS: ", - op_name, - " op with int64 input is supported natively starting from macOS 13.2"); - TORCH_CHECK_TYPE(!isComplexType(self.scalar_type()) || mps::supportsComplex(), - "Complex types are supported starting from MacOS 14.0+"); MPSStream* mpsStream = getCurrentMPSStream(); const bool is_self_scalar = self.dim() == 0; diff --git a/aten/src/ATen/native/mps/operations/Blas.mm b/aten/src/ATen/native/mps/operations/Blas.mm index f167067216d48..101ef5feb224e 100644 --- a/aten/src/ATen/native/mps/operations/Blas.mm +++ b/aten/src/ATen/native/mps/operations/Blas.mm @@ -51,9 +51,6 @@ inline void dot_check(const Tensor& self, const Tensor& other) { } // namespace mps Tensor dot_mps(const Tensor& self, const Tensor& other) { - TORCH_CHECK(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS) || self.scalar_type() != ScalarType::Long, - "MPS: dot op doesn't support int64 input on MacOS13") - using namespace mps; using CachedGraph = MPSBinaryCachedGraph; diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm index 97d562730dd8a..d572d52d103a1 100644 --- a/aten/src/ATen/native/mps/operations/Convolution.mm +++ b/aten/src/ATen/native/mps/operations/Convolution.mm @@ -124,7 +124,6 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_, IntArrayRef dilation, int64_t groups, std::optional input_shape) { - const bool is_macOS_13_2_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS); const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS); Tensor input_t = input_t_; bool is3DConv = input_t.dim() == 5; @@ -132,9 +131,6 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_, input_t = input_t.contiguous(); } - TORCH_CHECK(((input_t.dim() < 5) || is_macOS_13_2_or_newer), - "Conv3D is only supported on MPS for MacOS_13_2 or newer"); - TORCH_CHECK(isFloatingType(input_t.scalar_type()), "Convolution is supported only for Floating types"); using namespace at::native::mps; diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm index 4f879c3b63b02..0c121cee8fb62 100644 --- a/aten/src/ATen/native/mps/operations/Copy.mm +++ b/aten/src/ATen/native/mps/operations/Copy.mm @@ -60,7 +60,6 @@ static void copy_cast_mps(at::Tensor& dst, outputTensor = [mpsGraph castTensor:outputTensor toType:dstDType name:@"cast"]; } if (needs_conj) { - TORCH_CHECK(supportsComplex(), "MPS complex tensors conjugation needs MacOS14+"); outputTensor = [mpsGraph conjugateWithTensor:outputTensor name:nil]; } @@ -275,24 +274,7 @@ void copy_blit_mps(void* dst, const void* src, size_t size) { // for GPU to GPU copies we only encode to stream's command buffer (no flushing) stream->copy(sourceBuffer, destBuffer, src.nbytes(), src_byte_offset, dst_byte_offset, profile_id); } else { - // Simulate cast to Complex on older MacOS by initializing real and imag parts - if (dst_.is_complex() && !supportsComplex()) { - if (!src.is_complex()) { - at::real(dst_).copy_(src); - at::imag(dst_).fill_(0); - } else if (src.is_conj() || dst_.is_conj()) { - 
// One cannot take view of conjugated tensor, but for some reason real and imag views are fine - // Use this to implement a conjugation - at::real(dst_).copy_(at::real(src)); - if (src.is_conj() != dst_.is_conj()) { - at::imag(dst_).copy_(at::neg(at::imag(src))); - } else { - at::imag(dst_).copy_(at::imag(src)); - } - } else { - at::view_as_real(dst_).copy_(at::view_as_real(src)); - } - } else if (dst_byte_offset) { + if (dst_byte_offset) { auto maybeCastedSource = at::empty(dst_.sizes(), dst_.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt); auto maybeCastedSourceBuffer = getMTLBufferStorage(maybeCastedSource); diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm index d072e5a40ac96..4d3f99ea9e02d 100644 --- a/aten/src/ATen/native/mps/operations/Distributions.mm +++ b/aten/src/ATen/native/mps/operations/Distributions.mm @@ -87,7 +87,6 @@ case kFloat: return MPSDataTypeFloat32; case kBFloat16: { - checkSupportsBFloat16(); return MPSDataTypeBFloat16; } default: diff --git a/aten/src/ATen/native/mps/operations/Dropout.mm b/aten/src/ATen/native/mps/operations/Dropout.mm new file mode 100644 index 0000000000000..116367d809eb5 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Dropout.mm @@ -0,0 +1,45 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +static Tensor native_dropout_mask_and_scale(const Tensor& input, const Tensor& mask, float scale) { + auto output = at::empty_like(input); + mps::binary_op_kernel("native_dropout_mask_and_scale", input, mask, output, scale); + return output; +} + +std::tuple native_dropout_mps(const Tensor& input, double p, std::optional train) { + if (input.numel() == 0 || !train.value_or(false) || p == 0) { + return {input.clone(), at::ones_like(input, input.options().dtype(c10::kBool))}; + } + + float p_comp = 1.0f - p; + Tensor mask = at::empty_like(input, input.options().dtype(c10::kBool)); + mask.bernoulli_(p_comp); + auto scale = p_comp == 0 ? 0.0f : 1.0f / p_comp; + Tensor output = native_dropout_mask_and_scale(input, mask, scale); + return {std::move(output), std::move(mask)}; +} + +Tensor native_dropout_backward_mps(const Tensor& grad, const Tensor& mask, double scale) { + auto grad_float = isFloatingType(grad.scalar_type()) ? 
grad : grad.to(c10::kFloat); + return native_dropout_mask_and_scale(grad_float, mask, scale); +} + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/mps/operations/FastFourierTransform.mm b/aten/src/ATen/native/mps/operations/FastFourierTransform.mm index a9ac701106170..7e9867c9b948d 100644 --- a/aten/src/ATen/native/mps/operations/FastFourierTransform.mm +++ b/aten/src/ATen/native/mps/operations/FastFourierTransform.mm @@ -88,7 +88,6 @@ Tensor _fft_c2c_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, // TODO: Investigate numerical discrepancies see https://github.com/pytorch/pytorch/issues/120237 Tensor& _fft_r2c_mps_out(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided, Tensor& out) { - TORCH_CHECK(supportsComplex(), "FFT operations are only supported on MacOS 14+"); auto key = __func__ + getTensorsStringKey({self, out}) + ":" + getArrayRefString(dim) + ":" + std::to_string(normalization) + ":" + std::to_string(onesided); @autoreleasepool { @@ -129,7 +128,6 @@ Tensor _fft_c2c_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t normalization, int64_t last_dim_size, Tensor& out) { - TORCH_CHECK(supportsComplex(), "FFT operations are only supported on MacOS 14+"); auto key = __func__ + getTensorsStringKey({self}) + ":" + getArrayRefString(dim) + ":" + std::to_string(normalization) + ":" + std::to_string(last_dim_size); @autoreleasepool { @@ -155,7 +153,6 @@ Tensor _fft_c2c_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, } Tensor& _fft_c2c_mps_out(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward, Tensor& out) { - TORCH_CHECK(supportsComplex(), "FFT operations are only supported on MacOS 14+"); auto key = __func__ + getTensorsStringKey({self}) + ":" + getArrayRefString(dim) + ":" + std::to_string(normalization) + ":" + std::to_string(forward); @autoreleasepool { diff --git a/aten/src/ATen/native/mps/operations/GridSampler.mm b/aten/src/ATen/native/mps/operations/GridSampler.mm index 1e701d314354d..ef85633889487 100644 --- a/aten/src/ATen/native/mps/operations/GridSampler.mm +++ b/aten/src/ATen/native/mps/operations/GridSampler.mm @@ -1,7 +1,10 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -9,9 +12,17 @@ #else #include #include +#include #endif namespace at::native { + +#ifndef PYTORCH_JIT_COMPILE_SHADERS +static auto& lib = mps::MetalShaderLibrary::getBundledLibrary(); +#else +#include +#endif + namespace mps { static void grid_sampler_2d_mps_impl(Tensor& output, const Tensor& input, @@ -120,6 +131,96 @@ static void grid_sampler_2d_mps_impl(Tensor& output, runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } + +static void grid_sampler_template(Tensor& output, + const Tensor& input, + const Tensor& grid, + int64_t _interpolation_mode, + int64_t _padding_mode, + bool align_corners, + int32_t sampler_dims, + const std::string& op_name) { + check_grid_sampler_common(input, grid); + switch (sampler_dims) { + case 2: + check_grid_sampler_2d(input, grid); + break; + case 3: + check_grid_sampler_3d(input, grid, _interpolation_mode); + break; + default: + TORCH_INTERNAL_ASSERT(false, "Only 2D and 3D sampling are supported, but got: ", sampler_dims); + } + TORCH_CHECK(input.scalar_type() == grid.scalar_type(), + "expected input and grid to have the same type, but got ", + input.scalar_type(), + " and ", + grid.scalar_type()); + + auto 
interpolation_mode = static_cast<GridSamplerInterpolation>(_interpolation_mode); + auto padding_mode = static_cast<GridSamplerPadding>(_padding_mode); + + switch (interpolation_mode) { + case GridSamplerInterpolation::Bilinear: + break; + case GridSamplerInterpolation::Nearest: + TORCH_CHECK(false, op_name, ": Unsupported Nearest interpolation"); + break; + case GridSamplerInterpolation::Bicubic: + TORCH_CHECK(false, op_name, ": Unsupported Bicubic interpolation"); + break; + default: + TORCH_CHECK(false, op_name, ": Unrecognised interpolation mode: ", _interpolation_mode); + } + + switch (padding_mode) { + case GridSamplerPadding::Zeros: + case GridSamplerPadding::Border: + case GridSamplerPadding::Reflection: + break; + default: + TORCH_CHECK(false, op_name, ": Unrecognised Padding Mode: ", _padding_mode); + } + + auto input_size = input.sizes(); + auto grid_size = grid.sizes(); + output.resize_({input_size[0], input_size[1], grid_size[1], grid_size[2], grid_size[3]}, MemoryFormat::Contiguous); + + auto dims = input.dim(); + + GridSamplerParams<5> params; + params.sampler_dims = sampler_dims; + params.padding_mode = padding_mode; + params.interpolation_mode = interpolation_mode; + params.align_corners = align_corners; + + for (const auto dim : c10::irange(dims)) { + params.output_sizes[dim] = safe_downcast<int32_t, int64_t>(output.size(dim)); + params.output_strides[dim] = safe_downcast<int32_t, int64_t>(output.stride(dim)); + params.input_sizes[dim] = safe_downcast<int32_t, int64_t>(input.size(dim)); + params.input_strides[dim] = safe_downcast<int32_t, int64_t>(input.stride(dim)); + params.grid_sizes[dim] = safe_downcast<int32_t, int64_t>(grid.size(dim)); + params.grid_strides[dim] = safe_downcast<int32_t, int64_t>(grid.stride(dim)); + } + + auto num_threads = output.numel(); + MPSStream* mpsStream = getCurrentMPSStream(); + + dispatch_sync_with_rethrow(mpsStream->queue(), ^() { + @autoreleasepool { + id<MTLComputeCommandEncoder> computeEncoder = mpsStream->commandEncoder(); + auto pso = lib.getPipelineStateForFunc("grid_sampler_" + scalarToMetalTypeString(input)); + + getMPSProfiler().beginProfileKernel(pso, op_name, {input, grid}); + [computeEncoder setComputePipelineState:pso]; + mtl_setArgs(computeEncoder, output, input, grid, params); + + mtl_dispatch1DJob(computeEncoder, pso, num_threads); + getMPSProfiler().endProfileKernel(pso); + } + }); +} + } // namespace mps Tensor grid_sampler_2d_mps(const Tensor& input, @@ -127,15 +228,6 @@ Tensor grid_sampler_2d_mps(const Tensor& input,
This may have performance implications."); - - return at::grid_sampler_2d(input.to("cpu"), grid.to("cpu"), interpolation_mode, padding_mode, align_corners) - .clone() - .to("mps"); - } - auto in_size = input.sizes(); auto grid_size = grid.sizes(); auto output = at::empty({in_size[0], in_size[1], grid_size[1], grid_size[2]}, input.options()); @@ -144,4 +236,21 @@ Tensor grid_sampler_2d_mps(const Tensor& input, return output; } +Tensor grid_sampler_3d_mps(const Tensor& input, + const Tensor& grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners) { + auto output = at::empty({0}, input.options(), MemoryFormat::Contiguous); + mps::grid_sampler_template(output, + input, + grid, + interpolation_mode, + padding_mode, + align_corners, + /*sampler_dims=*/3, + /*op_name=*/"grid_sampler_3d"); + return output; +} + } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm index f00d155559da0..fa19d2f4d127f 100644 --- a/aten/src/ATen/native/mps/operations/Indexing.mm +++ b/aten/src/ATen/native/mps/operations/Indexing.mm @@ -108,26 +108,12 @@ static void validateInputData(const TensorIteratorBase& iter, IntArrayRef index_size, IntArrayRef index_stride, - const std::string& op, - bool accumulate) { - using namespace mps; - + const std::string& op) { const auto num_indices = index_size.size(); TORCH_CHECK(num_indices <= 16, "Current limit allows up to 16 indices to be used in MPS indexing kernels"); AT_ASSERT(num_indices == index_stride.size()); AT_ASSERT(static_cast(num_indices) == iter.ntensors() - 2); - const Tensor& inputTensor = iter.tensor(1); - const auto scalar_type = inputTensor.scalar_type(); - - if (accumulate) { - // No atomic support for the complex dtypes - TORCH_CHECK(c10::isIntegralType(scalar_type, /*includesBool=*/true) || supportedFloatingType(scalar_type)); - } else { - TORCH_CHECK(c10::isIntegralType(scalar_type, /*includesBool=*/true) || supportedFloatingType(scalar_type) || - scalar_type == ScalarType::ComplexFloat || scalar_type == ScalarType::ComplexHalf, - getMPSTypeString(inputTensor) + std::string(" not supported for index.Tensor_out")); - } } static Tensor& masked_select_out_mps_impl(Tensor& result, const Tensor& self, const Tensor& mask) { @@ -158,7 +144,7 @@ static void dispatch_index_kernel(TensorIteratorBase& iter, IntArrayRef index_stride, const std::string& kernel_name, const bool serial = false) { - validateInputData(iter, index_size, index_stride, "index.Tensor_out", /*accumulate=*/false); + validateInputData(iter, index_size, index_stride, "index.Tensor_out"); if (iter.numel() == 0) return; if (!iter.can_use_32bit_indexing()) { @@ -200,7 +186,7 @@ static void dispatch_index_kernel(TensorIteratorBase& iter, } static void index_kernel_mps(TensorIteratorBase& iter, IntArrayRef index_size, IntArrayRef index_stride) { - validateInputData(iter, index_size, index_stride, "index.Tensor_out", /*accumulate=*/false); + validateInputData(iter, index_size, index_stride, "index.Tensor_out"); dispatch_index_kernel( iter, index_size, index_stride, fmt::format("index_select_{}", getBitSizeString(iter.tensor_base(0)))); } @@ -210,7 +196,7 @@ static void index_put_kernel_mps(TensorIterator& iter, IntArrayRef index_stride, bool accumulate) { @autoreleasepool { - validateInputData(iter, index_size, index_stride, "index_put_impl", accumulate); + validateInputData(iter, index_size, index_stride, "index_put_impl"); if (accumulate) { dispatch_index_kernel(iter, index_size, @@ -244,7 +230,7 
@@ static void index_put_kernel_mps(TensorIterator& iter, index.numel()); int64_t idx = index.item(); TORCH_CHECK(idx == 0, "index_copy_(): the only valid index for a 0-dim tensor is 0, but got ", idx); - result.copy_(source); + result.copy_(source.squeeze()); return; } @@ -268,11 +254,12 @@ static void index_put_kernel_mps(TensorIterator& iter, } } - TORCH_CHECK(source.size(dim) == index.numel(), + const auto source_size_dim = source.dim() > 0 ? source.size(dim) : 1; + TORCH_CHECK(index.numel() == source_size_dim, "index_copy_(): Number of indices (", index.numel(), ") should be equal to source.size(dim) (", - source.size(dim), + source_size_dim, ")"); auto stream = getCurrentMPSStream(); @@ -295,7 +282,7 @@ static void index_put_kernel_mps(TensorIterator& iter, [computeEncoder setComputePipelineState:indexCopyPSO]; mtl_setArgs(computeEncoder, result, self, source, index, dim_arg, self.sizes(), ndim, indices_numel); if (!is_dense) { - mtl_setArgs<8>(computeEncoder, self.strides(), result.strides(), source.strides()); + mtl_setArgs<8>(computeEncoder, self.strides(), result.strides(), source.strides(), index.strides()); } mtl_dispatch1DJob(computeEncoder, indexCopyPSO, result.numel()); } @@ -353,14 +340,7 @@ static Tensor nonzero_fallback(const Tensor& self) { } Tensor& nonzero_out_mps(const Tensor& self, Tensor& out_) { - if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) { - TORCH_WARN_ONCE("MPS: nonzero op is supported natively starting from macOS 14.0. ", - "Falling back on CPU. This may have performance implications."); - Tensor out_fallback = nonzero_fallback(self); - at::native::resize_output(out_, out_fallback.sizes()); - out_.copy_(out_fallback); - return out_; - } else if (self.is_complex()) { + if (self.is_complex()) { TORCH_WARN_ONCE("MPS: nonzero op is not supported for complex datatypes. ", "Falling back on CPU. This may have performance implications."); Tensor out_fallback = nonzero_fallback(self); @@ -445,11 +425,7 @@ static Tensor nonzero_fallback(const Tensor& self) { } Tensor nonzero_mps(const Tensor& self) { - if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) { - TORCH_WARN_ONCE("MPS: nonzero op is supported natively starting from macOS 14.0. ", - "Falling back on CPU. This may have performance implications."); - return nonzero_fallback(self); - } else if (self.is_complex()) { + if (self.is_complex()) { TORCH_WARN_ONCE("MPS: nonzero op is not supported for complex datatypes ", "Falling back on CPU. This may have performance implications."); return nonzero_fallback(self); @@ -537,7 +513,28 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) { return; } - TORCH_CHECK(source.scalar_type() != ScalarType::Long, "index_add(): Expected non int64 dtype for source."); + bool use_deterministic_algorithm = globalContext().deterministicAlgorithms(); + + // TODO: Do not use deterministic algorithm for long/complex but rather implement it as Metal shader + use_deterministic_algorithm |= source.scalar_type() == ScalarType::Long; + use_deterministic_algorithm |= c10::isComplexType(source.scalar_type()); + + if (use_deterministic_algorithm) { + if (!result.is_same(self)) { + result.copy_(self); + } + torch::List> indices; + indices.reserve(dim + 1); + for (const auto i : c10::irange(dim)) { + indices.emplace_back(); + } + indices.emplace_back(index.to(at::kLong)); + const Tensor result_ = (result.dim() == 0) ? result.view(1) : result; + const Tensor source_ = (source.dim() == 0) ? 
source.view(1) : source; + result_.index_put_(indices, source_.mul(alpha), true); + return; + } + auto casted_type = isFloatingType(source.scalar_type()) ? ScalarType::Float : ScalarType::Int; struct CachedGraph : public MPSCachedGraph { @@ -599,28 +596,7 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) { } Tensor index_select_mps(const Tensor& self, int64_t dim, const Tensor& index) { - IntArrayRef input_shape = self.sizes(); - auto num_input_dims = input_shape.size(); - - auto num_indices = index.numel(); - TORCH_CHECK_INDEX(index.dim() <= 1, "index_select(): Index is supposed to be a vector"); - - dim = maybe_wrap_dim(dim, self.dim()); - std::vector shape_data(num_input_dims); - - // Calculate new shape - for (const auto i : c10::irange(num_input_dims)) { - if (i == static_cast(dim)) { - shape_data[i] = num_indices; - } else { - shape_data[i] = input_shape[i]; - } - } - - IntArrayRef output_shape = IntArrayRef(shape_data.data(), num_input_dims); - - Tensor result = at::empty(output_shape, self.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt); - + Tensor result = at::empty({0}, self.options()); index_select_out_mps(self, dim, index, result); return result; } @@ -642,25 +618,11 @@ Tensor index_select_mps(const Tensor& self, int64_t dim, const Tensor& index) { TORCH_CHECK(self.scalar_type() == output.scalar_type(), "index_select(): self and output must have the same scalar type"); TORCH_CHECK(dim == 0 || dim < self.dim(), "index_select(): Indexing dim ", dim, " is out of bounds of tensor"); - TORCH_CHECK(output.dim() == 0 || index.size(-1) == output.size(dim), - "index_select(): index and output must have the same size at `dim`th dimension, but got ", - index.size(-1), - " and ", - output.size(dim), - "."); - - for (const auto i : irange(self.dim())) { - if (i == dim) - continue; - TORCH_CHECK(self.size(i) == output.size(i), - "index_select(): self and output must have the same dimensions except for `dim`th dimension, but got ", - self.size(i), - " and ", - output.size(i), - " at dimension ", - i, - "."); + auto output_size = self.sizes().vec(); + if (self.dim() > 0) { + output_size[dim] = num_indices; } + at::native::resize_output(output, output_size); // Empty index if (num_indices == 0 || self.numel() == 0) { @@ -946,6 +908,8 @@ Tensor embedding_dense_backward_mps(const Tensor& grad_, TORCH_CHECK(index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int, "index_fill_(): Expected dtype int32 or int64 for index"); TORCH_CHECK(dim == 0 || dim < self.dim(), "index_fill_(): Indexing dim ", dim, " is out of bounds of tensor"); + // MPS.scatter crashes if used with complex dtypes + TORCH_CHECK(!c10::isComplexType(self.scalar_type()), "index_fill_(): Complex types are yet not supported"); // Empty index if (num_indices == 0) { diff --git a/aten/src/ATen/native/mps/operations/Linear.mm b/aten/src/ATen/native/mps/operations/Linear.mm index 42769c13f1e1b..219086edd8e37 100644 --- a/aten/src/ATen/native/mps/operations/Linear.mm +++ b/aten/src/ATen/native/mps/operations/Linear.mm @@ -115,7 +115,10 @@ Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const std::opt return output; } - if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS)) { + // No-graph execution causes nonsense if these are non-contiguous. 
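For context on the deterministic index_add_ path added in Indexing.mm above: routing long/complex (or deterministic-mode) inputs through an accumulating index_put_ preserves index_add_ semantics, including accumulation over duplicate indices. A minimal Python sketch of that equivalence (illustrative only; CPU tensors for clarity, the snippet is not part of the patch):

import torch

# Accumulating index_put_ reproduces index_add_ along dim 0; accumulate=True
# is what makes duplicate indices sum instead of overwrite.
x = torch.zeros(4, 3)
idx = torch.tensor([1, 1, 3])        # note the duplicate index
src = torch.ones(3, 3)

via_index_add = x.clone().index_add_(0, idx, src, alpha=2.0)
via_index_put = x.clone().index_put_((idx,), src * 2.0, accumulate=True)
assert torch.equal(via_index_add, via_index_put)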
+ const bool is_contiguous = input.is_contiguous() && weight.is_contiguous() && bias.is_contiguous(); + + if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_contiguous) { _mps_linear_nograph(input, weight, bias, output); // Squeeze last dim of 1D linear return weight_arg.dim() != 1 ? output : output.squeeze(-1); diff --git a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm index 3cdf0021e987f..7a3dde679c05f 100644 --- a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm +++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm @@ -112,6 +112,61 @@ return output; } +Tensor& do_metal_addmm(const Tensor& self, + const Tensor& other, + Tensor& output, + const Scalar& alpha, + const Scalar& beta, + const Tensor& bias) { + if (beta.toDouble() == 0 && alpha.toDouble() == 1) { + return do_metal_mm(self, other, output); + } + auto stream = getCurrentMPSStream(); + auto device = MPSDevice::getInstance()->device(); + auto matmulPSO = lib.getPipelineStateForFunc("addmm_" + mps::scalarToMetalTypeString(output)); + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + getMPSProfiler().beginProfileKernel(matmulPSO, "addmm", {self, other}); + auto computeEncoder = stream->commandEncoder(); + [computeEncoder setComputePipelineState:matmulPSO]; + std::array sizes = {static_cast(self.size(0)), + static_cast(self.size(1)), + static_cast(output.size(1))}; + std::array strides = {self.stride(0), + self.stride(1), + other.stride(0), + other.stride(1), + output.stride(0), + output.stride(1), + bias.stride(0), + bias.stride(1)}; + union { + std::array i64; + std::array i32; + std::array f32; + } alpha_beta; + if (output.scalar_type() == kLong) { + alpha_beta.i64 = {alpha.toLong(), beta.toLong()}; + } else if (c10::isIntegralType(output.scalar_type(), true)) { + alpha_beta.i32 = {alpha.toInt(), beta.toInt()}; + } else { + TORCH_INTERNAL_ASSERT(c10::isFloatingType(output.scalar_type())); + alpha_beta.f32 = {alpha.toFloat(), beta.toFloat()}; + } + constexpr uint32_t TILE_DIM = 16; // fastest performance from tests on multiple macs + uint32_t gridSizeX = (output.size(1) + TILE_DIM - 1) / TILE_DIM; + uint32_t gridSizeY = (self.size(0) + TILE_DIM - 1) / TILE_DIM; + + MTLSize threadsPerThreadgroup = MTLSizeMake(TILE_DIM, TILE_DIM, 1); + MTLSize threadgroupsPerGrid = MTLSizeMake(gridSizeX, gridSizeY, 1); + mtl_setArgs(computeEncoder, self, other, output, bias, alpha_beta.i64, strides, sizes); + [computeEncoder dispatchThreadgroups:threadgroupsPerGrid threadsPerThreadgroup:threadsPerThreadgroup]; + getMPSProfiler().endProfileKernel(matmulPSO); + } + }); + return output; +} + std::tuple do_mm(MPSGraph* graph, const Tensor& self, const Tensor& other) { @@ -644,7 +699,6 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const TORCH_CHECK(output.is_mps()); TORCH_CHECK(self.dim() == 2 && other.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK(supportedFloatingOrComplexType(self), "MPS device does not support addmm for non-float input"); TensorArg args[]{{output, "out", 0}, {bias, "self", 1}, {self, "mat1", 2}, {other, "mat2", 3}}; checkAllSameGPU(__func__, args); @@ -671,6 +725,10 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const return output; } + if (use_metal_mm(self, other, output)) { + return do_metal_addmm(self, other, output, alpha, beta, *bias_); + } + bool is_beta_non_zero = beta.toDouble() != 0.0; struct CachedGraph : public mps::MPSCachedGraph { diff --git 
a/aten/src/ATen/native/mps/operations/Pad.mm b/aten/src/ATen/native/mps/operations/Pad.mm index 0c2c25946bb4b..2945ebf715f27 100644 --- a/aten/src/ATen/native/mps/operations/Pad.mm +++ b/aten/src/ATen/native/mps/operations/Pad.mm @@ -460,6 +460,9 @@ Tensor replication_pad3d_backward_mps(const Tensor& grad_output, const Tensor& i // backward pass is explicitly handled in autograd by negating the "pad" argument Tensor constant_pad_nd_mps(const Tensor& self, IntArrayRef pad, const Scalar& value) { + if (pad.empty()) { + return self.clone(); + } if (pad.size() > 6) { TORCH_WARN_ONCE("MPS: The constant padding of more than 3 dimensions is not currently supported natively. ", "It uses View Ops default implementation to run. This may have performance implications."); diff --git a/aten/src/ATen/native/mps/operations/Pooling.mm b/aten/src/ATen/native/mps/operations/Pooling.mm index b2bc870844a88..d916320b2e238 100644 --- a/aten/src/ATen/native/mps/operations/Pooling.mm +++ b/aten/src/ATen/native/mps/operations/Pooling.mm @@ -297,13 +297,13 @@ static PoolSizes process_pool_sizes(const Tensor& input, pooling_dims, " ints"); - TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 3, + TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == pooling_dims, op_name, ": stride must either be omitted, a single int, or a tuple of ", pooling_dims, " ints"); - TORCH_CHECK(padding.size() == 1 || padding.size() == 3, + TORCH_CHECK(padding.size() == 1 || padding.size() == pooling_dims, op_name, ": padding must either be a single int, or a tuple of ", pooling_dims, @@ -333,6 +333,22 @@ static PoolSizes process_pool_sizes(const Tensor& input, ": pad should be at most half of effective kernel size"); } + if (pooling_dims == 2) { + const auto memory_format = input.suggest_memory_format(); + bool valid_dims = input.size(1) != 0 && input.size(2) != 0; + if (memory_format == at::MemoryFormat::ChannelsLast) { + // Expect tensor in NHWC format and allow 0-dim only for N. + TORCH_CHECK((dims == 4 && valid_dims && input.size(3) != 0), + "Expected 4D (batch mode) tensor expected for input with channels_last layout" + " with optional 0 dim batch size for input, but got: ", + input.sizes()); + } else { + TORCH_CHECK((dims == 3 && input.size(0) != 0 && valid_dims) || (dims == 4 && valid_dims && input.size(3) != 0), + "Expected 3D or 4D (batch mode) tensor with optional 0 dim batch size for input, but got:", + input.sizes()); + } + } + for (const auto dim : c10::irange(static_cast(leading_dims == 2), dims)) { TORCH_CHECK(input.size(dim) > 0, op_name, ": Expected input's non-batch dimensions to have positive length"); } @@ -786,6 +802,16 @@ static void avg_pool_backward_out_mps_template(const Tensor& grad_input, } // namespace mps +// TODO: The MPS graph impl can sometimes give significantly better performance +// than the Metal impl for cases where the stride is 1 in all dimensions. There +// may be a code path in the graph kernel that specifically optimizes for that +// case. We should look into implementing a specialized case in Metal so we can +// avoid using the graph impl. +static bool use_graph_for_max_pool2d(IntArrayRef kernel_size, IntArrayRef stride_) { + IntArrayRef stride = stride_.empty() ? 
kernel_size : stride_; + return (stride[0] == 1) && (stride.size() == 1 || stride[1] == 1); +} + Tensor mps_max_pool2d(const Tensor& input, IntArrayRef kernel_size, IntArrayRef stride, @@ -793,24 +819,37 @@ Tensor mps_max_pool2d(const Tensor& input, IntArrayRef dilation, bool ceil_mode) { Tensor output = at::empty({0}, input.options(), MemoryFormat::Contiguous); - mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) { - MPSGraph* mpsGraph = cachedGraph.graph(); - return [mpsGraph maxPooling2DWithSourceTensor:cachedGraph.inputTensor descriptor:desc name:nil]; - }; - mps::pool2d_template(input, - output, - std::nullopt, - std::nullopt, - kernel_size, - stride, - padding, - dilation, - ceil_mode, - false, - std::nullopt, - pooling_op_block, - "max_pool2d"); - + bool use_graph = use_graph_for_max_pool2d(kernel_size, stride); + if (use_graph) { + mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) { + MPSGraph* mpsGraph = cachedGraph.graph(); + return [mpsGraph maxPooling2DWithSourceTensor:cachedGraph.inputTensor descriptor:desc name:nil]; + }; + mps::pool2d_template(input, + output, + std::nullopt, + std::nullopt, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + false, + std::nullopt, + pooling_op_block, + "max_pool2d"); + } else { + mps::max_pool_with_indices_out_mps_template(output, + std::nullopt, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + /*pooling_dims=*/2, + "max_pool2d"); + } return output; } @@ -855,32 +894,45 @@ Tensor mps_max_pool2d_backward(const Tensor& grad_output, bool ceil_mode, const Tensor& output, const Tensor& indices) { - auto indices_memory_format = indices.suggest_memory_format(); - - mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) { - MPSGraph* mpsGraph = cachedGraph.graph(); - NSArray* poolOutputs = [mpsGraph maxPooling2DReturnIndicesWithSourceTensor:cachedGraph.inputTensor - descriptor:desc - name:nil]; - cachedGraph.indicesTensor = mps::castMPSTensor(mpsGraph, poolOutputs[1], ScalarType::Long); - return poolOutputs[0]; - }; - mps::pool2d_template(input, - output, - indices, - std::nullopt, - kernel_size, - stride, - padding, - dilation, - ceil_mode, - false, - std::nullopt, - pooling_op_block, - "max_pool2d_indices"); + bool use_graph = use_graph_for_max_pool2d(kernel_size, stride); + if (use_graph) { + auto indices_memory_format = indices.suggest_memory_format(); + + mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) { + MPSGraph* mpsGraph = cachedGraph.graph(); + NSArray* poolOutputs = + [mpsGraph maxPooling2DReturnIndicesWithSourceTensor:cachedGraph.inputTensor descriptor:desc name:nil]; + cachedGraph.indicesTensor = mps::castMPSTensor(mpsGraph, poolOutputs[1], ScalarType::Long); + return poolOutputs[0]; + }; + mps::pool2d_template(input, + output, + indices, + std::nullopt, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + false, + std::nullopt, + pooling_op_block, + "max_pool2d_indices"); + if (indices_memory_format == MemoryFormat::ChannelsLast) { + const_cast(indices) = indices.to(MemoryFormat::ChannelsLast); + } - if (indices_memory_format == MemoryFormat::ChannelsLast) { - const_cast(indices) = indices.to(MemoryFormat::ChannelsLast); + } else { + mps::max_pool_with_indices_out_mps_template(output, + indices, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + /*pooling_dims=*/2, + "max_pool2d"); } } @@ -1085,17 +1137,30 @@ Tensor max_unpooling3d_forward_mps(const Tensor& self, bool count_include_pad, 
std::optional divisor_override, const Tensor& output) { - mps::avg_pool2d_template(input, - output, - std::nullopt, - {kH, kW}, - {dH, dW}, - {padH, padW}, - {1, 1}, - ceil_mode, - count_include_pad, - divisor_override, - "avg_pool2d"); + if (ceil_mode) { + mps::avg_pool_out_mps_template(output, + input, + {kH, kW}, + {dH, dW}, + {padH, padW}, + ceil_mode, + count_include_pad, + divisor_override, + /*pooling_dims=*/2, + "avg_pool3d"); + } else { + mps::avg_pool2d_template(input, + output, + std::nullopt, + {kH, kW}, + {dH, dW}, + {padH, padW}, + {1, 1}, + ceil_mode, + count_include_pad, + divisor_override, + "avg_pool2d"); + } } TORCH_IMPL_FUNC(avg_pool2d_backward_out_mps) diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm index 21020bad467d0..ae13504d9003e 100644 --- a/aten/src/ATen/native/mps/operations/ReduceOps.mm +++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm @@ -152,8 +152,6 @@ static void reduction_out_mps(const Tensor& input_t, const Tensor& output_t, MPSReductionType reduction_type, const std::string& func_name) { - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, func_name); // NS: TODO: get rid of all those shenanigans and just call reduction_op with view tensor bool canSqueezeLastDim = true; IntArrayRef input_shape = input_t.sizes(); @@ -236,12 +234,10 @@ static void reduction_out_mps(const Tensor& input_t, MPSGraphTensor* castInputTensor = inputTensor; MPSDataType inputCastType = MPSDataTypeInvalid; if (dtype.has_value() && - (dtype.value() == kFloat || dtype.value() == kHalf || dtype.value() == kInt || - (dtype.value() == kLong && macOS13_3_plus))) { + (dtype.value() == kFloat || dtype.value() == kHalf || dtype.value() == kInt || dtype.value() == kLong)) { inputCastType = getMPSDataType(dtype.value()); } else if (inputScalarType != kInt && inputScalarType != kHalf && inputScalarType != kFloat && - inputScalarType != kComplexFloat && inputScalarType != kComplexHalf && - (inputScalarType != kLong || !macOS13_3_plus)) { + inputScalarType != kComplexFloat && inputScalarType != kComplexHalf && inputScalarType != kLong) { inputCastType = getMPSDataType(kFloat); } @@ -460,7 +456,7 @@ static Tensor std_var_common_impl_mps(const Tensor& input_t, errMessage += ": reduction dim must be in the range of input shape"; for (const auto dim : dim_value) { auto wrap_dim = maybe_wrap_dim(dim, num_input_dims); - TORCH_CHECK(wrap_dim < static_cast(input_shape.size()), errMessage.c_str()) + TORCH_CHECK(wrap_dim < (num_input_dims ? num_input_dims : 1), errMessage.c_str()) } } @@ -615,9 +611,6 @@ static Tensor std_var_common_impl_mps(const Tensor& input_t, } static Tensor median_common_mps(const Tensor& input_t, bool nanmedian) { - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, nanmedian ? 
"nanmedian" : "median"); - IntArrayRef input_shape = input_t.sizes(); int64_t num_in_elements = c10::multiply_integers(input_shape); @@ -634,8 +627,7 @@ static Tensor median_common_mps(const Tensor& input_t, bool nanmedian) { auto medianCachedGraph = LookUpOrCreateCachedGraph(medianKey, [&](auto mpsGraph, auto newCachedGraph) { MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t); - MPSGraphTensor* castInputTensor = - castToIHFTypes(mpsGraph, inputTensor, input_t, /*includesInt64=*/macOS13_3_plus); + MPSGraphTensor* castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t); MPSGraphTensor* reshapedTensor = [mpsGraph reshapeTensor:castInputTensor withShape:@[ @-1 ] name:nil]; @@ -693,9 +685,6 @@ static Tensor median_common_mps(const Tensor& input_t, bool nanmedian) { } static Tensor min_max_mps_impl(const Tensor& input_t, MPSReductionType reduction_type, const std::string& func_name) { - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, "min_max"); - using CachedGraph = MPSUnaryCachedGraph; IntArrayRef input_shape = input_t.sizes(); @@ -713,8 +702,7 @@ static Tensor min_max_mps_impl(const Tensor& input_t, MPSReductionType reduction MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t); MPSGraphTensor* castOutputTensor = nil; - MPSGraphTensor* castInputTensor = - castToIHFTypes(mpsGraph, inputTensor, input_t, /*includesInt64=*/macOS13_3_plus); + MPSGraphTensor* castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t); NSArray* axes = getTensorAxes(input_t); if (reduction_type == MPSReductionType::MAX) { @@ -749,9 +737,6 @@ static void min_max_out_mps(const Tensor& input_t, const Tensor& indices_t, MPSReductionType reduction_type, const std::string& func_name) { - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, "min_max_out"); - if (output_t.numel() == 0) { return; } @@ -789,8 +774,7 @@ static void min_max_out_mps(const Tensor& input_t, auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t); MPSGraphTensor* outputTensor = nil; - MPSGraphTensor* castInputTensor = - castToIHFTypes(mpsGraph, inputTensor, input_t, /*includesInt64=*/macOS13_3_plus); + MPSGraphTensor* castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t); if (reduction_type == MPSReductionType::MAX) { outputTensor = [mpsGraph reductionMaximumPropagateNaNWithTensor:castInputTensor axis:(NSInteger)dim_ name:nil]; @@ -896,9 +880,6 @@ static void argmax_argmin_out_mps(const Tensor& input_t, const std::string& func_name) { using CachedGraph = MPSUnaryCachedGraph; - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, "argmax_argmin_out"); - int64_t dim_ = -1; if (dim.has_value()) { @@ -953,7 +934,7 @@ static void argmax_argmin_out_mps(const Tensor& input_t, MPSGraphTensor* castInputTensor = inputTensor; if (inputScalarType != kInt && inputScalarType != kHalf && inputScalarType != kFloat && - (inputScalarType != kLong || !macOS13_3_plus)) { + inputScalarType != kLong) { castInputTensor = castMPSTensor(mpsGraph, inputTensor, kFloat); } if (reduction_type == MPSReductionType::MAX) { @@ -1282,9 +1263,6 @@ static void all_any_common_impl_mps(const Tensor& input_t, return; } - bool macOS13_3_plus = 
is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, op_name); - int64_t dim_ = maybe_wrap_dim(dim, input_t.dim()); native::zero_numel_check_dims(input_t, dim_, op_name.c_str()); @@ -1303,7 +1281,7 @@ static void all_any_common_impl_mps(const Tensor& input_t, auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t); - auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t, /*includesInt64=*/macOS13_3_plus); + auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t); // reductionOrWithTensor:axis: will throw an internal assert if number of dimentions is more than 4 // See https://github.com/pytorch/pytorch/issues/95538 MPSGraphTensor* outputTensor = nil; @@ -1369,14 +1347,11 @@ static void all_any_common_impl_mps(const Tensor& input_t, return; } - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, "any_all_out"); - @autoreleasepool { std::string key = std::string("any_all_out_mps:") + getTensorsStringKey(input_t); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t); - auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t, /*includesInt64=*/macOS13_3_plus); + auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t); // reductionOrWithTensor:axes: will throw an internal assert if number of dimentions is more than 4 // See https://github.com/pytorch/pytorch/issues/95538 if (input_t.dim() > 4) { @@ -1420,14 +1395,11 @@ static void all_any_common_impl_mps(const Tensor& input_t, return; } - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, "all_all_out"); - @autoreleasepool { std::string key = std::string("all_all_out_mps:") + getTensorsStringKey(input_t); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t); - auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t, /*includesInt64=*/macOS13_3_plus); + auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t); // reductionAndWithTensor:axes: will throw an internal assert if number of dimentions is more than 4 // See https://github.com/pytorch/pytorch/issues/95538 if (input_t.ndimension() > 4) { @@ -1512,9 +1484,6 @@ static void median_out_mps_common(const Tensor& input_t, Tensor& indices, const std::string& func_name, bool nanmedian) { - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, "median_out"); - int64_t dim_ = maybe_wrap_dim(dim, input_t.dim()); native::zero_numel_check_dims(input_t, dim_, "max()"); @@ -1585,8 +1554,7 @@ static void median_out_mps_common(const Tensor& input_t, getTensorsStringKey(indices); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t); - MPSGraphTensor* castInputTensor = - castToIHFTypes(mpsGraph, inputTensor, input_t, /*includesInt64=*/macOS13_3_plus); + MPSGraphTensor* castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t); MPSGraphTensor* effectiveLengthTensor = nil; if (nanmedian) { diff --git 
a/aten/src/ATen/native/mps/operations/Repeat.mm b/aten/src/ATen/native/mps/operations/Repeat.mm index 10668309a8c23..40afa15b4f700 100644 --- a/aten/src/ATen/native/mps/operations/Repeat.mm +++ b/aten/src/ATen/native/mps/operations/Repeat.mm @@ -129,16 +129,8 @@ void computeRepeatIndices(const index_t* repeat_ptr, }); } -Tensor repeat_interleave_mps(const Tensor& repeat_, std::optional output_size) { +Tensor repeat_interleave_mps(const Tensor& repeat, std::optional output_size) { Tensor output; - Tensor repeat = repeat_; - if (repeat.scalar_type() == kLong && !is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS)) { - // #103810551: `repeat_interleave_common` uses cumsum to calculate the final shape of output, - // which currently doesn't support int64_t as input. Casting internally the indices to int32_t. - TORCH_WARN_ONCE( - "MPS: no support for int64 repeats mask, casting it to int32. Support has been added in macOS 13.3"); - repeat = repeat.to(kInt); - } AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_mps", [&]() { output = repeat_interleave_common>(repeat, output_size); }); diff --git a/aten/src/ATen/native/mps/operations/ScanKernel.mm b/aten/src/ATen/native/mps/operations/ScanKernel.mm index 9e3269d970143..80495ba9d501d 100644 --- a/aten/src/ATen/native/mps/operations/ScanKernel.mm +++ b/aten/src/ATen/native/mps/operations/ScanKernel.mm @@ -23,125 +23,6 @@ #include #endif -// Generic scan implementation that handles both simple scans and scans with indices -static void scan_mps_impl(const Tensor& self, - const std::vector& outputs, - int64_t dim, - const std::string& op_name) { - if (outputs[0].numel() == 0) { - return; - } - - const int64_t ndim = self.dim(); - const int64_t wrapped_dim = maybe_wrap_dim(dim, ndim); - - // Calculate dimensions for scan operation - int64_t row_size = self.size(wrapped_dim); - auto sizes = self.sizes(); - - bool is_innermost = (wrapped_dim == ndim - 1); - - // Check if all tensors are contiguous - bool is_contiguous = self.is_contiguous(); - for (const auto& output : outputs) { - is_contiguous = is_contiguous && output.is_contiguous(); - } - - uint32_t num_rows, num_orows, num_irows, num_threads; - - if (is_innermost) { - // Treat all outer dimensions as a single dimension - num_rows = self.numel() / row_size; - num_threads = num_rows; - } else { - // Treat all outer dimensions (i.e. dim_ < dim) as one - num_orows = c10::multiply_integers(sizes.begin(), sizes.begin() + wrapped_dim); - // Treat all inner dimensions (i.e. dim > dimension) as one - num_irows = c10::multiply_integers(sizes.begin() + wrapped_dim + 1, sizes.end()); - num_threads = num_orows * num_irows; - } - - MPSStream* mpsStream = getCurrentMPSStream(); - dispatch_sync_with_rethrow(mpsStream->queue(), ^() { - @autoreleasepool { - id computeEncoder = mpsStream->commandEncoder(); - - // Choose kernel based on contiguity and dimension - std::string kernel_name; - if (is_contiguous) { - kernel_name = - op_name + "_contiguous_" + (is_innermost ? 
"innermost_" : "outer_") + scalarToMetalTypeString(self); - } else { - kernel_name = op_name + "_strided_" + scalarToMetalTypeString(self); - } - - id scanPSO = lib.getPipelineStateForFunc(kernel_name); - - // this function call is a no-op if MPS Profiler is not enabled - getMPSProfiler().beginProfileKernel(scanPSO, op_name, [&]() { - std::vector all_tensors = {self}; - all_tensors.insert(all_tensors.end(), outputs.begin(), outputs.end()); - return all_tensors; - }()); - - [computeEncoder setComputePipelineState:scanPSO]; - - // Set input tensor - mtl_setBuffer(computeEncoder, self, 0); - - // Set output tensors - for (size_t i = 0; i < outputs.size(); ++i) { - mtl_setBuffer(computeEncoder, outputs[i], i + 1); - } - - if (is_contiguous) { - // Contiguous kernels - if (is_innermost) { - if (outputs.size() == 1) { - // Simple scan - mtl_setArgs<2>(computeEncoder, num_rows, static_cast(row_size)); - } else { - // Scan with indices - mtl_setArgs<3>(computeEncoder, num_rows, static_cast(row_size)); - } - } else { - if (outputs.size() == 1) { - // Simple scan - mtl_setArgs<2>(computeEncoder, num_orows, num_irows, static_cast(row_size)); - } else { - // Scan with indices - mtl_setArgs<3>(computeEncoder, num_orows, num_irows, static_cast(row_size)); - } - } - } else { - // Strided kernels - pass full tensor information - if (outputs.size() == 1) { - // Simple scan - mtl_setArgs<2>(computeEncoder, - self.sizes(), - self.strides(), - outputs[0].strides(), - static_cast(self.ndimension()), - static_cast(wrapped_dim)); - } else { - // Scan with indices - mtl_setArgs<3>(computeEncoder, - self.sizes(), - self.strides(), - outputs[0].strides(), - outputs[1].strides(), - static_cast(self.ndimension()), - static_cast(wrapped_dim)); - } - } - - mtl_dispatch1DJob(computeEncoder, scanPSO, num_threads); - - getMPSProfiler().endProfileKernel(scanPSO); - } - }); -} - // Utility function to get 2D grid dimensions for dispatch static std::pair get_2d_grid_dims(const IntArrayRef& shape, const int64_t dim) { size_t grid_x = 1; @@ -375,19 +256,11 @@ static void scan_with_indices_mps_impl(const Tensor& self, } // namespace mps void cummax_helper_mps(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim) { - if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) { - mps::scan_with_indices_mps_impl(self, values, indices, dim, "cummax"); - } else { - mps::scan_mps_impl(self, {values, indices}, dim, "cummax"); - } + mps::scan_with_indices_mps_impl(self, values, indices, dim, "cummax"); } void cummin_helper_mps(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim) { - if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) { - mps::scan_with_indices_mps_impl(self, values, indices, dim, "cummin"); - } else { - mps::scan_mps_impl(self, {values, indices}, dim, "cummin"); - } + mps::scan_with_indices_mps_impl(self, values, indices, dim, "cummin"); } Tensor& _logcumsumexp_out_mps(const Tensor& self, int64_t dim, Tensor& result) { @@ -402,11 +275,7 @@ void cummin_helper_mps(const Tensor& self, Tensor& values, Tensor& indices, int6 return result; } - if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) { - mps::scan_simple_mps_impl(self, result, wrap_dim, "logcumsumexp"); - } else { - mps::scan_mps_impl(self, {result}, wrap_dim, "logcumsumexp"); - } + mps::scan_simple_mps_impl(self, result, wrap_dim, "logcumsumexp"); return result; } diff --git a/aten/src/ATen/native/mps/operations/Sort.mm b/aten/src/ATen/native/mps/operations/Sort.mm index c73b7c33098f1..6ff47044df133 100644 --- 
a/aten/src/ATen/native/mps/operations/Sort.mm +++ b/aten/src/ATen/native/mps/operations/Sort.mm @@ -2,6 +2,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #include #include #include @@ -11,10 +12,85 @@ #include #include #else +#include #include #include #endif namespace at::native { +namespace { + +void kthvalue_out_mps_impl(const Tensor& self, int64_t k, int64_t dim, Tensor& values, Tensor& indices) { + using namespace mps; + if (self.dim() == 0 && self.numel() == 1) { + values.copy_(self); + indices.zero_(); + return; + } + // Handle empty tensors + if (self.numel() == 0) { + values.copy_(self); + indices.copy_(values.toType(at::ScalarType::Long)); + return; + } + // issue #154890, raising error to prevent crash within MPSGraph until + // workaround is implemented. + TORCH_CHECK(self.dim() - dim <= 4, "On-going issue on MPSGraph topk when ndims() - axis > 4, see issue #154890"); + + auto stream = getCurrentMPSStream(); + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *selfTensor = nil, *valuesTensor = nil, *indicesTensor = nil; + }; + + // MPSGraph kthvalue is always sorted. + @autoreleasepool { + // Input as placeholders + MPSShape* input_shape = getMPSShape(self); + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + std::string key = std::string("kthvalue:") + [ns_shape_key UTF8String] + ":" + getMPSTypeString(self) + ":k" + + std::to_string(k) + ":dim" + std::to_string(dim); + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { + newCachedGraph->selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self), input_shape); + + MPSGraphTensor* castInputTensor = newCachedGraph->selfTensor; + MPSDataType dataType = getMPSDataType(self); + // #issue 104398441 sortWithTensor and argsortWithTensor + if (dataType != MPSDataTypeInt32 && dataType != MPSDataTypeFloat32 && dataType != MPSDataTypeFloat16) { + dataType = (dataType & MPSDataTypeFloatBit) ? 
MPSDataTypeFloat32 : MPSDataTypeInt32; + castInputTensor = [mpsGraph castTensor:newCachedGraph->selfTensor toType:dataType name:@"castInputTensor"]; + } + MPSGraphTensor* sortedTensor = [mpsGraph sortWithTensor:castInputTensor + axis:(NSUInteger)dim + descending:false + name:nil]; + sortedTensor = [mpsGraph sliceTensor:sortedTensor + dimension:(NSUInteger)dim + start:((NSUInteger)k - 1) + length:1 + name:nil]; + MPSGraphTensor* argSortedTensor = [mpsGraph argSortWithTensor:castInputTensor + axis:(NSInteger)dim + descending:false + name:@"kthvalue_out"]; + argSortedTensor = [mpsGraph sliceTensor:argSortedTensor + dimension:dim + start:((NSUInteger)k - 1) + length:1 + name:nil]; + newCachedGraph->valuesTensor = sortedTensor; + newCachedGraph->indicesTensor = argSortedTensor; + }); + Placeholder inputPlaceholder = Placeholder(cachedGraph->selfTensor, self); + // Outputs as placeholders + Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values); + Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices); + // Create dictionary of inputs and outputs + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + auto results = dictionaryFromPlaceholders(valuesPlaceholder, indicesPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} +} // anonymous namespace // sort TORCH_IMPL_FUNC(sort_stable_out_mps) @@ -26,9 +102,6 @@ const Tensor& indices) { using namespace mps; - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(self, macOS13_3_plus, "sort_stable_out"); - if (self.numel() == 0) { return; } @@ -55,8 +128,7 @@ auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { newCachedGraph->selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self), input_shape); - MPSGraphTensor* castInputTensor = - castToIHFTypes(mpsGraph, newCachedGraph->selfTensor, self, /*includesInt64=*/macOS13_3_plus); + MPSGraphTensor* castInputTensor = castToIHFTypes(mpsGraph, newCachedGraph->selfTensor, self); MPSGraphTensor* sortedTensor = [mpsGraph sortWithTensor:castInputTensor axis:(NSInteger)dim descending:(BOOL)descending @@ -85,4 +157,31 @@ runMPSGraph(stream, cachedGraph->graph(), feeds, results); } } + +std::tuple kthvalue_out_mps(const Tensor& self, + int64_t k, + int64_t dim_, + bool keepdim, + Tensor& values, + Tensor& indices) { + // See note [Writing Nondeterministic Operations] + // If there are duplicate elements of the kth value, the procedure for choosing which + // of the duplicates to use for the indices output is nondeterministic. + at::globalContext().alertNotDeterministic("kthvalue MPS"); + + int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); + int64_t slicesize = self.dim() == 0 ? 
1 : self.size(dim); + TORCH_CHECK(k >= 1 && k <= slicesize, "kthvalue(): selected number k out of range for dimension ", dim); + at::assert_no_overlap(self, values); + _reduction_with_indices_allocate_or_resize_output(values, indices, self, dim, keepdim); + + kthvalue_out_mps_impl(self, k, dim, values, indices); + + if (!keepdim) { + values.squeeze_(dim); + indices.squeeze_(dim); + } + + return std::forward_as_tuple(values, indices); +} } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/TensorCompare.mm b/aten/src/ATen/native/mps/operations/TensorCompare.mm index 6e030c99d0356..7b637d896f850 100644 --- a/aten/src/ATen/native/mps/operations/TensorCompare.mm +++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm @@ -297,9 +297,6 @@ static void isin_Tensor_Tensor_out_mps(const Tensor& elements, const auto common_type = at::result_type(elements, test_elements); TORCH_CHECK(elements.is_mps() && test_elements.is_mps()); - TORCH_CHECK(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS) || supportedFloatingType(common_type), - "isin_Tensor_Tensor_out only works on floating types on MPS for pre MacOS_14_0. Received dtype: ", - common_type); @autoreleasepool { std::string key = op_name + getTensorsStringKey({elements, test_elements}) + std::to_string(invert); @@ -338,6 +335,9 @@ static void isin_Tensor_Tensor_out_mps(const Tensor& elements, } static void is_posneginf_helper(TensorIteratorBase& iter, bool is_neg) { + if (iter.numel() == 0) { + return; + } const auto& self = iter.input(0); auto& out = iter.output(0); @autoreleasepool { diff --git a/aten/src/ATen/native/mps/operations/UnaryKernel.mm b/aten/src/ATen/native/mps/operations/UnaryKernel.mm index b560739ed40c3..7e150b133cc65 100644 --- a/aten/src/ATen/native/mps/operations/UnaryKernel.mm +++ b/aten/src/ATen/native/mps/operations/UnaryKernel.mm @@ -50,6 +50,7 @@ static void round_decimals_kernel(TensorIteratorBase& iter, int64_t decimals) { REGISTER_UNARY_TI_DISPATCH(log); REGISTER_UNARY_TI_DISPATCH(log1p); REGISTER_UNARY_TI_DISPATCH(bitwise_not); +REGISTER_UNARY_TI_DISPATCH(round); REGISTER_UNARY_TI_DISPATCH(sigmoid); REGISTER_DISPATCH(round_decimals_stub, round_decimals_kernel); } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm index edf45a5ff80d0..d7ce40e5cbb4f 100644 --- a/aten/src/ATen/native/mps/operations/UnaryOps.mm +++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm @@ -184,7 +184,6 @@ static void unary_op(const Tensor& self, REGISTER_MPS_UNARY_STUB(ceil, ceil); REGISTER_MPS_UNARY_STUB(floor, floor); -REGISTER_MPS_UNARY_STUB(round, round); REGISTER_MPS_UNARY_STUB(trunc, truncate); #define CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(func_out, func_stub) \ @@ -208,28 +207,12 @@ static void unary_op(const Tensor& self, } Tensor& angle_out_mps(const Tensor& self, Tensor& output) { - if (mps::supportsComplex()) { - mps::unary_op(self, output, "angle_out_mps", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { - auto realPart = [mpsGraph realPartOfTensor:inputTensor name:nil]; - auto imagPart = [mpsGraph imaginaryPartOfTensor:inputTensor name:nil]; - return [mpsGraph atan2WithPrimaryTensor:imagPart secondaryTensor:realPart name:nil]; - }); - return output; - } else { - TORCH_CHECK(!self.is_complex(), "MPS does not support angle with complex input on macOS13") - mps::unary_op(self, output, "angle_out_mps", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { - // On macOS 13 with 
non-complex input, realPartOfTensor and imaginaryPartOfTensor are - // not available, and NaN is not propagated correctly: - auto imagPart = [mpsGraph constantWithScalar:0.0 shape:inputTensor.shape dataType:inputTensor.dataType]; - auto result = [mpsGraph atan2WithPrimaryTensor:imagPart secondaryTensor:inputTensor name:nil]; - auto nanMask = [mpsGraph isNaNWithTensor:inputTensor name:nil]; - return [mpsGraph selectWithPredicateTensor:nanMask - truePredicateTensor:inputTensor - falsePredicateTensor:result - name:nil]; - }); - return output; - } + mps::unary_op(self, output, "angle_out_mps", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { + auto realPart = [mpsGraph realPartOfTensor:inputTensor name:nil]; + auto imagPart = [mpsGraph imaginaryPartOfTensor:inputTensor name:nil]; + return [mpsGraph atan2WithPrimaryTensor:imagPart secondaryTensor:realPart name:nil]; + }); + return output; } Tensor angle_mps(const Tensor& self) { @@ -362,7 +345,6 @@ static void cumulative_op_impl(const Tensor& self, const Tensor& result, MPSCumulativeOpType cumulativeOpType, const std::string& op_name) { - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); auto nDims = self.dim(); auto wrapped_dim = maybe_wrap_dim(dim, nDims); TORCH_CHECK(wrapped_dim >= 0 && wrapped_dim < std::max(1LL, self.ndimension()), @@ -381,11 +363,6 @@ static void cumulative_op_impl(const Tensor& self, bool castInputData = (isIntegralType(input.scalar_type(), true) && input.scalar_type() != ScalarType::Int && input.scalar_type() != ScalarType::Long); - TORCH_CHECK(macOS13_3_plus || input.scalar_type() != ScalarType::Long, - "MPS does not support ", - op_name, - " op with int64 input. Support has been added in macOS 13.3"); - mps::unary_op( input, result, op_name + std::to_string(dim), ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { if (castInputData) { @@ -440,17 +417,10 @@ static void cumulative_op_impl(const Tensor& self, Tensor& conj_physical_out_mps(const Tensor& self, Tensor& result) { TORCH_CHECK(self.is_complex()); - if (!mps::supportsComplex()) { - if (!result.is_same_size(self)) { - result.resize_(self.sizes()); - } - at::real(result).copy_(at::real(self)); - at::imag(result).copy_(at::neg(at::imag(self))); - } else { - mps::unary_op(self, result, "conj", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { - return [mpsGraph conjugateWithTensor:inputTensor name:nil]; - }); - } + TORCH_CHECK(self.dtype() != at::kComplexDouble); + mps::unary_op(self, result, "conj", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { + return [mpsGraph conjugateWithTensor:inputTensor name:nil]; + }); return result; } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index db8eef9349642..abb061afc5c95 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -288,6 +288,7 @@ dispatch: CPU: native_dropout_cpu CUDA: native_dropout_cuda + MPS: native_dropout_mps NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: native_dropout_nested tags: [nondeterministic_seeded, core] autogen: native_dropout.out @@ -296,6 +297,7 @@ dispatch: CPU, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: native_dropout_backward CUDA: native_dropout_backward_cuda + MPS: native_dropout_backward_mps autogen: native_dropout_backward.out tags: pointwise @@ -340,8 +342,8 @@ variants: function, method dispatch: CompositeExplicitAutograd: abs - SparseCPU, SparseCUDA: abs_sparse - 
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr + SparseCPU, SparseCUDA, SparseMPS: abs_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs tags: [core, pointwise] @@ -350,16 +352,16 @@ variants: function, method dispatch: CompositeExplicitAutograd: abs_ - SparseCPU, SparseCUDA: abs_sparse_ - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_ + SparseCPU, SparseCUDA, SparseMPS: abs_sparse_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs_ - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: CPU, CUDA, MPS, MTIA: abs_out - SparseCPU, SparseCUDA: abs_sparse_out - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_out + SparseCPU, SparseCUDA, SparseMPS: abs_sparse_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_out tags: pointwise # Note [Adding an alias] @@ -428,7 +430,7 @@ variants: function, method structured_delegate: sgn.out dispatch: - SparseCPU, SparseCUDA: sgn_sparse + SparseCPU, SparseCUDA, SparseMPS: sgn_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn tags: pointwise @@ -437,7 +439,7 @@ variants: method structured_delegate: sgn.out dispatch: - SparseCPU, SparseCUDA: sgn_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn_ tags: pointwise @@ -448,7 +450,7 @@ dispatch: CPU, CUDA: sgn_out MPS: sgn_out_mps - SparseCPU, SparseCUDA: sgn_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_out tags: pointwise @@ -476,7 +478,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: _conj_physical - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr autogen: _conj_physical.out - func: conj_physical(Tensor self) -> Tensor @@ -487,8 +489,8 @@ dispatch: CPU, CUDA: conj_physical_out MPS: conj_physical_out_mps - SparseCPU, SparseCUDA: conj_physical_out_sparse - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr_out + SparseCPU, SparseCUDA, SparseMPS: conj_physical_out_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr_out tags: pointwise - func: conj_physical_(Tensor(a!) self) -> Tensor(a!) 
@@ -554,7 +556,7 @@ structured_delegate: add.out variants: function, method dispatch: - SparseCPU, SparseCUDA, SparseMeta: add_sparse + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr MkldnnCPU: mkldnn_add ZeroTensor: add_zerotensor @@ -566,7 +568,7 @@ variants: method structured_delegate: add.out dispatch: - SparseCPU, SparseCUDA, SparseMeta: add_sparse_ + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_ MkldnnCPU: mkldnn_add_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_add__Tensor @@ -582,6 +584,7 @@ dispatch: SparseCPU, SparseMeta: add_out_sparse_cpu SparseCUDA: add_out_sparse_cuda + SparseMPS: add_out_sparse_mps SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu SparseCsrCUDA: add_out_sparse_compressed_cuda MkldnnCPU: mkldnn_add_out @@ -874,7 +877,7 @@ variants: function, method structured_delegate: asinh.out dispatch: - SparseCPU, SparseCUDA: asinh_sparse + SparseCPU, SparseCUDA, SparseMPS: asinh_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr tags: [core, pointwise] @@ -882,7 +885,7 @@ variants: function, method structured_delegate: asinh.out dispatch: - SparseCPU, SparseCUDA: asinh_sparse_ + SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_ tags: pointwise @@ -892,7 +895,7 @@ dispatch: CPU, CUDA: asinh_out MPS: asinh_out_mps - SparseCPU, SparseCUDA: asinh_sparse_out + SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_out tags: pointwise @@ -909,7 +912,7 @@ structured_delegate: atanh.out variants: function, method dispatch: - SparseCPU, SparseCUDA: atanh_sparse + SparseCPU, SparseCUDA, SparseMPS: atanh_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr tags: [core, pointwise] @@ -917,7 +920,7 @@ structured_delegate: atanh.out variants: function, method dispatch: - SparseCPU, SparseCUDA: atanh_sparse_ + SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_ tags: pointwise @@ -927,7 +930,7 @@ dispatch: CPU, CUDA: atanh_out MPS: atanh_out_mps - SparseCPU, SparseCUDA: atanh_sparse_out + SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_out tags: pointwise # arctanh, alias for atanh @@ -964,7 +967,7 @@ variants: function, method structured_delegate: asin.out dispatch: - SparseCPU, SparseCUDA: asin_sparse + SparseCPU, SparseCUDA, SparseMPS: asin_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr tags: [core, pointwise] @@ -973,7 +976,7 @@ variants: function, method structured_delegate: asin.out dispatch: - SparseCPU, SparseCUDA: asin_sparse_ + SparseCPU, SparseCUDA, SparseMPS: asin_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_ tags: pointwise @@ -983,7 +986,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: asin_out - SparseCPU, SparseCUDA: asin_sparse_out + SparseCPU, SparseCUDA, SparseMPS: asin_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_out tags: pointwise @@ -1001,7 +1004,7 @@ structured_delegate: atan.out variants: function, method dispatch: - SparseCPU, SparseCUDA: atan_sparse + SparseCPU, SparseCUDA, SparseMPS: atan_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr tags: [core, pointwise] @@ -1010,7 +1013,7 @@ structured_delegate: atan.out 
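The SparseMPS / SparseCsrMPS dispatch entries in the hunks above and below reuse the existing shared sparse kernels (abs_sparse, sgn_sparse, conj_physical_out_sparse, ...) for MPS tensors. A small usage sketch, assuming an MPS-enabled build (the CPU fallback keeps the snippet runnable elsewhere):

import torch

# Elementwise ops on sparse COO tensors, e.g. abs(), now dispatch on MPS.
device = "mps" if torch.backends.mps.is_available() else "cpu"
indices = torch.tensor([[0, 1, 1], [2, 0, 2]])
values = torch.tensor([-3.0, 4.0, -5.0])
s = torch.sparse_coo_tensor(indices, values, (2, 3), device=device)
print(s.abs().coalesce().values())   # tensor([3., 4., 5.])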
variants: function, method dispatch: - SparseCPU, SparseCUDA: atan_sparse_ + SparseCPU, SparseCUDA, SparseMPS: atan_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_ tags: pointwise @@ -1020,7 +1023,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: atan_out - SparseCPU, SparseCUDA: atan_sparse_out + SparseCPU, SparseCUDA, SparseMPS: atan_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_out tags: pointwise @@ -1459,7 +1462,7 @@ structured_delegate: ceil.out variants: function, method dispatch: - SparseCPU, SparseCUDA: ceil_sparse + SparseCPU, SparseCUDA, SparseMPS: ceil_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr tags: [core, pointwise] @@ -1468,7 +1471,7 @@ structured_delegate: ceil.out variants: function, method dispatch: - SparseCPU, SparseCUDA: ceil_sparse_ + SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_ tags: pointwise @@ -1478,7 +1481,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: ceil_out - SparseCPU, SparseCUDA: ceil_sparse_out + SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_out tags: pointwise @@ -2406,7 +2409,7 @@ MPS: empty_mps Meta: empty_meta_symint MkldnnCPU: empty_mkldnn - SparseCPU, SparseCUDA: empty_sparse + SparseCPU, SparseCUDA, SparseMPS: empty_sparse SparseMeta: empty_sparse_symint SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed SparseCsrMeta: empty_sparse_compressed_symint @@ -2534,7 +2537,7 @@ structured_delegate: erf.out variants: function, method dispatch: - SparseCPU, SparseCUDA: erf_sparse + SparseCPU, SparseCUDA, SparseMPS: erf_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr tags: [core, pointwise] @@ -2543,7 +2546,7 @@ structured_delegate: erf.out variants: function, method dispatch: - SparseCPU, SparseCUDA: erf_sparse_ + SparseCPU, SparseCUDA, SparseMPS: erf_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_ tags: pointwise @@ -2553,7 +2556,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS, MTIA: erf_out - SparseCPU, SparseCUDA: erf_sparse_out + SparseCPU, SparseCUDA, SparseMPS: erf_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_out tags: pointwise @@ -2619,7 +2622,7 @@ structured_delegate: expm1.out variants: function, method dispatch: - SparseCPU, SparseCUDA: expm1_sparse + SparseCPU, SparseCUDA, SparseMPS: expm1_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr tags: [core, pointwise] @@ -2628,7 +2631,7 @@ structured_delegate: expm1.out variants: function, method dispatch: - SparseCPU, SparseCUDA: expm1_sparse_ + SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_ tags: pointwise @@ -2638,7 +2641,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: expm1_out - SparseCPU, SparseCUDA: expm1_sparse_out + SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_out tags: pointwise @@ -2737,7 +2740,7 @@ structured_delegate: floor.out variants: function, method dispatch: - SparseCPU, SparseCUDA: floor_sparse + SparseCPU, SparseCUDA, SparseMPS: floor_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr tags: [core, pointwise] @@ -2746,7 +2749,7 @@ structured_delegate: floor.out variants: function, method dispatch: - SparseCPU, SparseCUDA: floor_sparse_ + SparseCPU, SparseCUDA, SparseMPS: 
floor_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_ tags: pointwise @@ -2756,7 +2759,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: floor_out - SparseCPU, SparseCUDA: floor_sparse_out + SparseCPU, SparseCUDA, SparseMPS: floor_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_out tags: pointwise @@ -2764,7 +2767,7 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA, MPS: floor_divide + CPU, CUDA, MPS, MTIA: floor_divide SparseCPU, SparseCUDA: floor_divide_sparse - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) @@ -2798,7 +2801,7 @@ structured_delegate: frac.out variants: function, method dispatch: - SparseCPU, SparseCUDA: frac_sparse + SparseCPU, SparseCUDA, SparseMPS: frac_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr tags: pointwise @@ -2807,7 +2810,7 @@ structured_delegate: frac.out variants: function, method dispatch: - SparseCPU, SparseCUDA: frac_sparse_ + SparseCPU, SparseCUDA, SparseMPS: frac_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_ tags: pointwise @@ -2818,7 +2821,7 @@ dispatch: CPU, CUDA: frac_out MPS: frac_out_mps - SparseCPU, SparseCUDA: frac_sparse_out + SparseCPU, SparseCUDA, SparseMPS: frac_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_out tags: pointwise @@ -2931,6 +2934,7 @@ dispatch: CPU: grid_sampler_3d_cpu CUDA: grid_sampler_3d_cuda + MPS: grid_sampler_3d_mps autogen: grid_sampler_3d.out # `grid_sampler_3d_backward` takes in `output_mask` to optimize performance for @@ -3207,7 +3211,7 @@ dispatch: CPU, CUDA, MPS, MTIA: isnan NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isnan - SparseCPU, SparseCUDA: isnan_sparse + SparseCPU, SparseCUDA, SparseMPS: isnan_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isnan_sparse_csr autogen: isnan.out tags: [core, pointwise] @@ -3288,6 +3292,7 @@ dispatch: CPU: kthvalue_out_cpu CUDA: kthvalue_out_cuda + MPS: kthvalue_out_mps - func: kthvalue.dimname(Tensor self, SymInt k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -3335,21 +3340,21 @@ variants: function, method dispatch: CompositeExplicitAutograd: nan_to_num - SparseCPU, SparseCUDA: nan_to_num_sparse + SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse tags: pointwise - func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!) variants: function, method dispatch: CompositeExplicitAutograd: nan_to_num_ - SparseCPU, SparseCUDA: nan_to_num_sparse_ + SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_ tags: pointwise - func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA, MTIA: nan_to_num_out MPS: nan_to_num_out_mps - SparseCPU, SparseCUDA: nan_to_num_sparse_out + SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_out tags: pointwise - func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor @@ -3447,8 +3452,12 @@ - func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor? bias) -> Tensor +- func: fbgemm_linear_fp16_weight_fp32_activation.out(Tensor input, Tensor packed_weight, Tensor? bias, Tensor(a!) output) -> Tensor + - func: fbgemm_linear_fp16_weight(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor +- func: fbgemm_linear_fp16_weight.out(Tensor input, Tensor packed_weight, Tensor bias, Tensor(a!) 
output) -> Tensor + - func: fbgemm_pack_quantized_matrix(Tensor input) -> Tensor - func: fbgemm_pack_quantized_matrix.KN(Tensor input, int K, int N) -> Tensor @@ -3548,7 +3557,7 @@ structured_delegate: log1p.out variants: function, method dispatch: - SparseCPU, SparseCUDA: log1p_sparse + SparseCPU, SparseCUDA, SparseMPS: log1p_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr tags: [core, pointwise] @@ -3557,7 +3566,7 @@ structured_delegate: log1p.out variants: function, method dispatch: - SparseCPU, SparseCUDA: log1p_sparse_ + SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_ tags: pointwise @@ -3567,7 +3576,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: log1p_out - SparseCPU, SparseCUDA: log1p_sparse_out + SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_out tags: pointwise @@ -4230,6 +4239,7 @@ - func: _weight_int8pack_mm(Tensor self, Tensor mat2, Tensor scales) -> Tensor dispatch: CPU: _weight_int8pack_mm_cpu + CUDA: _weight_int8pack_mm_cuda MPS: _weight_int8pack_mm_mps - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor @@ -4658,7 +4668,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: rad2deg - SparseCPU, SparseCUDA: rad2deg_sparse + SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr tags: pointwise @@ -4666,14 +4676,14 @@ variants: function, method dispatch: CompositeExplicitAutograd: rad2deg_ - SparseCPU, SparseCUDA: rad2deg_sparse_ + SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_ tags: pointwise - func: rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: CompositeExplicitAutograd: rad2deg_out - SparseCPU, SparseCUDA: rad2deg_sparse_out + SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_out tags: pointwise @@ -4681,7 +4691,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: deg2rad - SparseCPU, SparseCUDA: deg2rad_sparse + SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr tags: pointwise @@ -4689,14 +4699,14 @@ variants: function, method dispatch: CompositeExplicitAutograd: deg2rad_ - SparseCPU, SparseCUDA: deg2rad_sparse_ + SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_ tags: pointwise - func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: CompositeExplicitAutograd: deg2rad_out - SparseCPU, SparseCUDA: deg2rad_sparse_out + SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_out tags: pointwise @@ -4922,7 +4932,7 @@ structured_delegate: neg.out variants: function, method dispatch: - SparseCPU, SparseCUDA: neg_sparse + SparseCPU, SparseCUDA, SparseMPS: neg_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg tags: [core, pointwise] @@ -4932,7 +4942,7 @@ structured_delegate: neg.out variants: function, method dispatch: - SparseCPU, SparseCUDA: neg_sparse_ + SparseCPU, SparseCUDA, SparseMPS: neg_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg_ tags: pointwise @@ -4943,7 +4953,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS, MTIA: neg_out - SparseCPU, SparseCUDA: neg_out_sparse + SparseCPU, SparseCUDA, SparseMPS: neg_out_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_out tags: pointwise # Alias for neg @@ -5027,7 +5037,7 @@ structured_delegate: round.out variants: function, method dispatch: - SparseCPU, SparseCUDA: round_sparse + SparseCPU, SparseCUDA, SparseMPS: round_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr tags: [core, pointwise] @@ -5036,7 +5046,7 @@ structured_delegate: round.out variants: function, method dispatch: - SparseCPU, SparseCUDA: round_sparse_ + SparseCPU, SparseCUDA, SparseMPS: round_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_ tags: pointwise @@ -5046,7 +5056,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: round_out - SparseCPU, SparseCUDA: round_sparse_out + SparseCPU, SparseCUDA, SparseMPS: round_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_out tags: pointwise @@ -5089,7 +5099,7 @@ QuantizedCPU: relu_quantized_cpu QuantizedCUDA: relu_quantized_cuda NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu - SparseCPU, SparseCUDA: relu_sparse + SparseCPU, SparseCUDA, SparseMPS: relu_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr tags: [core, pointwise] @@ -5104,7 +5114,7 @@ QuantizedCPU: relu_quantized_cpu_ QuantizedCUDA: relu_quantized_cuda_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu_ - SparseCPU, SparseCUDA: relu_sparse_ + SparseCPU, SparseCUDA, SparseMPS: relu_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr_ autogen: relu.out tags: pointwise @@ -5391,7 +5401,7 @@ variants: function, method dispatch: SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr - SparseCPU, SparseCUDA: sin_sparse + SparseCPU, SparseCUDA, SparseMPS: sin_sparse NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sin tags: [core, pointwise] @@ -5401,7 +5411,7 @@ variants: function, method dispatch: SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_ - SparseCPU, SparseCUDA: sin_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sin_sparse_ tags: pointwise - func: sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
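Among the entries above, _weight_int8pack_mm picks up a CUDA dispatch next to the existing CPU and MPS kernels. Assuming the usual weight-only-quantization reading of the schema (float activations shaped [M, K], an int8 weight stored as [N, K], one float scale per output channel), the math it computes might look like the sketch below; this is an illustration of the assumed semantics, not the actual kernel.

// Reference semantics sketch (assumed): float activations, int8 weight [N, K],
// per-output-channel float scales [N].
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<float> int8_weight_only_mm(
    const std::vector<float>& a, int64_t M, int64_t K,
    const std::vector<int8_t>& w, int64_t N,   // w laid out as [N, K]
    const std::vector<float>& scales) {        // one scale per output channel
  std::vector<float> out(M * N, 0.f);
  for (int64_t m = 0; m < M; ++m) {
    for (int64_t n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int64_t k = 0; k < K; ++k) {
        acc += a[m * K + k] * static_cast<float>(w[n * K + k]);
      }
      out[m * N + n] = acc * scales[n];        // dequantize per channel
    }
  }
  return out;
}

int main() {
  // 1x2 activation times a [N=2, K=2] int8 weight.
  auto y = int8_weight_only_mm({1.f, 2.f}, 1, 2, {10, 20, 30, 40}, 2, {0.1f, 0.01f});
  std::cout << y[0] << " " << y[1] << "\n";    // prints: 5 1.1
}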
@@ -5411,7 +5421,7 @@ dispatch: CPU, CUDA, MPS, MTIA: sin_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_out - SparseCPU, SparseCUDA: sin_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sin_sparse_out tags: pointwise - func: sinc(Tensor self) -> Tensor @@ -5436,7 +5446,7 @@ structured_delegate: sinh.out variants: function, method dispatch: - SparseCPU, SparseCUDA: sinh_sparse + SparseCPU, SparseCUDA, SparseMPS: sinh_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr tags: [core, pointwise] @@ -5445,7 +5455,7 @@ structured_delegate: sinh.out variants: function, method dispatch: - SparseCPU, SparseCUDA: sinh_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_ tags: pointwise @@ -5455,7 +5465,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: sinh_out - SparseCPU, SparseCUDA: sinh_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_out # Returns a copy of this `Variable` that is detached from its autograd graph. @@ -5503,6 +5513,13 @@ tags: core manual_cpp_binding: True +- func: sym_is_contiguous(Tensor self, MemoryFormat memory_format=contiguous_format) -> SymBool + variants: function + device_check: NoCheck + device_guard: False + tags: core + manual_cpp_binding: True + - func: sym_numel(Tensor self) -> SymInt variants: function device_check: NoCheck @@ -5898,7 +5915,7 @@ variants: function, method dispatch: NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sqrt - SparseCPU, SparseCUDA: sqrt_sparse + SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr tags: [core, pointwise] @@ -5907,7 +5924,7 @@ structured_delegate: sqrt.out variants: function, method dispatch: - SparseCPU, SparseCUDA: sqrt_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_ tags: pointwise @@ -5917,7 +5934,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS, MTIA: sqrt_out - SparseCPU, SparseCUDA: sqrt_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_out tags: pointwise @@ -6055,7 +6072,7 @@ structured_delegate: tan.out variants: function, method dispatch: - SparseCPU, SparseCUDA: tan_sparse + SparseCPU, SparseCUDA, SparseMPS: tan_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr tags: [core, pointwise] @@ -6064,7 +6081,7 @@ structured_delegate: tan.out variants: function, method dispatch: - SparseCPU, SparseCUDA: tan_sparse_ + SparseCPU, SparseCUDA, SparseMPS: tan_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_ tags: pointwise @@ -6074,7 +6091,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: tan_out - SparseCPU, SparseCUDA: tan_sparse_out + SparseCPU, SparseCUDA, SparseMPS: tan_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_out tags: pointwise @@ -6085,7 +6102,7 @@ dispatch: QuantizedCPU: tanh_quantized_cpu MkldnnCPU: mkldnn_tanh - SparseCPU, SparseCUDA: tanh_sparse + SparseCPU, SparseCUDA, SparseMPS: tanh_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh tags: [core, pointwise] @@ -6096,7 +6113,7 @@ variants: function, method dispatch: MkldnnCPU: mkldnn_tanh_ - SparseCPU, SparseCUDA: tanh_sparse_ + SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_ 
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh_ tags: pointwise @@ -6107,7 +6124,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS, MTIA: tanh_out - SparseCPU, SparseCUDA: tanh_sparse_out + SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_out tags: pointwise @@ -6379,8 +6396,8 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - SparseCPU, SparseCUDA: trunc_sparse - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr + SparseCPU, SparseCUDA, SparseMPS: trunc_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr tags: [core, pointwise] - func: trunc_(Tensor(a!) self) -> Tensor(a!) @@ -6388,8 +6405,8 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - SparseCPU, SparseCUDA: trunc_sparse_ - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_ + SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_ tags: pointwise - func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -6398,8 +6415,8 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA, MPS: trunc_out - SparseCPU, SparseCUDA: trunc_sparse_out - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_out + SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_out tags: pointwise # Alias for trunc @@ -6909,7 +6926,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: clone - SparseCPU, SparseCUDA: clone_sparse + SparseCPU, SparseCUDA, SparseMPS: clone_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: clone_sparse_compressed MkldnnCPU: mkldnn_clone QuantizedCPU, QuantizedCUDA: quantized_clone @@ -6944,7 +6961,7 @@ CPU, CUDA: zero_ MPS: zero_mps_ Meta: zero_meta_ - SparseCPU, SparseCUDA, SparseMeta: zero_sparse_ + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: zero_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: zero_sparse_csr_ MkldnnCPU: mkldnn_zero_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: zero_nested_ @@ -7150,6 +7167,7 @@ - func: _grouped_mm(Tensor self, Tensor mat2, Tensor? offs=None, Tensor? bias=None, ScalarType? out_dtype=None) -> Tensor variants: function dispatch: + CompositeExplicitAutograd: _grouped_mm CUDA: _grouped_mm_cuda # NOTE [ Sparse: autograd and API ] @@ -7361,8 +7379,8 @@ - func: _to_dense(Tensor self, ScalarType? dtype=None, bool? 
masked_grad=None) -> Tensor variants: method dispatch: - SparseCPU, SparseCUDA: sparse_to_dense - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_compressed_to_dense + SparseCPU, SparseCUDA, SparseMPS: sparse_to_dense + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: sparse_compressed_to_dense MkldnnCPU: mkldnn_to_dense autogen: _to_dense.out @@ -7388,8 +7406,8 @@ - func: dense_dim(Tensor self) -> int variants: method dispatch: - SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: dense_dim_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: dense_dim_sparse_csr CompositeExplicitAutograd: dense_dim_default device_check: NoCheck device_guard: False @@ -7422,6 +7440,7 @@ dispatch: SparseCPU: _coalesce_sparse_cpu SparseCUDA: _coalesce_sparse_cuda + SparseMPS: _coalesce_sparse_mps autogen: _coalesce.out - func: is_coalesced(Tensor self) -> bool @@ -7460,7 +7479,7 @@ - func: indices(Tensor(a) self) -> Tensor(a) variants: method dispatch: - SparseCPU, SparseCUDA, SparseMeta: indices_sparse + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: indices_sparse CompositeExplicitAutograd: indices_default device_check: NoCheck device_guard: False @@ -7468,7 +7487,7 @@ - func: values(Tensor(a) self) -> Tensor(a) variants: method dispatch: - SparseCPU, SparseCUDA, SparseMeta: values_sparse + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: values_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: values_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: values_nested CompositeExplicitAutograd: values_default @@ -7521,7 +7540,7 @@ device_check: NoCheck # Allows copy into different device variants: function dispatch: - SparseCPU, SparseCUDA, SparseMeta: copy_sparse_ + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: copy_sparse_ autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out # By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors @@ -9712,7 +9731,7 @@ structured_delegate: sign.out variants: function, method dispatch: - SparseCPU, SparseCUDA: sign_sparse + SparseCPU, SparseCUDA, SparseMPS: sign_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr tags: [core, pointwise] @@ -9721,7 +9740,7 @@ structured_delegate: sign.out variants: method dispatch: - SparseCPU, SparseCUDA: sign_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sign_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_ tags: pointwise @@ -9732,7 +9751,7 @@ dispatch: CPU, CUDA: sign_out MPS: sign_out_mps - SparseCPU, SparseCUDA: sign_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sign_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_out tags: pointwise @@ -9740,7 +9759,7 @@ variants: function, method structured_delegate: signbit.out dispatch: - SparseCPU, SparseCUDA: signbit_sparse + SparseCPU, SparseCUDA, SparseMPS: signbit_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr tags: pointwise @@ -9751,7 +9770,7 @@ CPU: signbit_out CUDA: signbit_out MPS: signbit_out_mps - SparseCPU, SparseCUDA: signbit_sparse_out + SparseCPU, SparseCUDA, SparseMPS: signbit_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr_out tags: pointwise @@ -9934,7 +9953,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: igamma_out + CPU, CUDA, MPS: igamma_out tags: pointwise - func: igamma(Tensor self, Tensor other) -> Tensor @@ -9951,7 +9970,7 @@ 
structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: igammac_out + CPU, CUDA, MPS: igammac_out tags: pointwise - func: igammac(Tensor self, Tensor other) -> Tensor @@ -13255,7 +13274,7 @@ dispatch: CompositeExplicitAutograd: isinf NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isinf - SparseCPU, SparseCUDA: isinf_sparse + SparseCPU, SparseCUDA, SparseMPS: isinf_sparse SparseMeta: isinf_sparse_meta SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isinf_sparse_csr autogen: isinf.out @@ -13271,7 +13290,7 @@ structured_delegate: isposinf.out dispatch: NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isposinf - SparseCPU, SparseCUDA: isposinf_sparse + SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr tags: pointwise @@ -13280,7 +13299,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: isposinf_out - SparseCPU, SparseCUDA: isposinf_sparse_out + SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr_out tags: pointwise @@ -13289,7 +13308,7 @@ structured_delegate: isneginf.out dispatch: NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isneginf - SparseCPU, SparseCUDA: isneginf_sparse + SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr tags: pointwise @@ -13298,7 +13317,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: isneginf_out - SparseCPU, SparseCUDA: isneginf_sparse_out + SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr_out tags: pointwise @@ -15011,6 +15030,7 @@ - func: _scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor attn_bias, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, *, float? scale=None) -> (Tensor, Tensor, Tensor) dispatch: CUDA: _scaled_dot_product_cudnn_attention_backward_cuda + NestedTensorCUDA: _scaled_dot_product_cudnn_attention_nestedtensor_backward_cuda tags: nondeterministic_seeded - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor rng_state, Tensor unused, Tensor debug_attn_mask) @@ -15043,6 +15063,11 @@ CUDA: _cudnn_attention_forward tags: nondeterministic_seeded +- func: _cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor attn_bias, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, *, float? 
scale=None) -> (Tensor, Tensor, Tensor) + dispatch: + CUDA: _cudnn_attention_backward + tags: nondeterministic_seeded + - func: _triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor variants: function dispatch: diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp index 5b7476453407e..96c6ab8310f80 100644 --- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp +++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp @@ -349,6 +349,63 @@ _scaled_dot_product_cudnn_attention_nestedtensor_cuda( return std::make_tuple(std::move(attention), std::move(log_sumexp), cumulative_sequence_length_q, cumulative_sequence_length_kv, max_seqlen_batch_q, max_seqlen_batch_kv, std::move(cudnn_seed), std::move(cudnn_offset), Tensor()); } +std::tuple _scaled_dot_product_cudnn_attention_nestedtensor_backward_cuda( + const Tensor& grad_out, + const Tensor& query, + const Tensor& key, + const Tensor& value, + const Tensor& out, + const Tensor& logsumexp, + const Tensor& philox_seed, + const Tensor& philox_offset, + const Tensor& attn_bias, + const Tensor& cum_seq_q, + const Tensor& cum_seq_k, + const int64_t max_q, + const int64_t max_k, + double dropout_p, + bool is_causal, + std::optional scale) { + if (!grad_out.defined()) { + return std::make_tuple(Tensor{}, Tensor{}, Tensor{}); + } + auto [ + grad_out_buffer_reshaped, + query_buffer_reshaped, + key_buffer_reshaped, + value_buffer_reshaped, + output_buffer_reshaped] = + preprocessing::sdpa_nested_preprocessing_backward( + grad_out, + query, + key, + value, + out, + cum_seq_q, + cum_seq_k, + max_q, + max_k); + + auto [dq, dk, dv] = at::_cudnn_attention_backward(grad_out_buffer_reshaped, + query_buffer_reshaped, + key_buffer_reshaped, + value_buffer_reshaped, + output_buffer_reshaped, + logsumexp, + philox_seed, + philox_offset, + attn_bias, + cum_seq_q, + cum_seq_k, + max_q, + max_k, + dropout_p, + is_causal, + scale); + return std::make_tuple(std::move(dq), std::move(dk), std::move(dv)); +} + + std::tuple _scaled_dot_product_flash_attention_backward_nested( const at::Tensor& grad_out_, const at::Tensor& query, diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index 4ca777be9cd44..f804670c31538 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -335,6 +335,8 @@ std::tuple choose_qparams_optimized( const int64_t n_bins, const double ratio, int64_t bit_width) { + const float* input_row = input_tensor.const_data_ptr(); + TORCH_CHECK_VALUE(input_row != nullptr, "input tensor is empty and has no data"); if (numel < 0 || numel > input_tensor.numel()) { TORCH_CHECK(false, "numel is out of the bound of input tensor"); @@ -342,7 +344,7 @@ std::tuple choose_qparams_optimized( TORCH_CHECK(numel <= input_tensor.numel(), "numel ", numel, " greater than input_tensor.numel() ", input_tensor.numel()); - const float* input_row = input_tensor.const_data_ptr(); + float xmin = *std::min_element(input_row, input_row + numel); float xmax = *std::max_element(input_row, input_row + numel); float n_bins_float = static_cast(n_bins); diff --git a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h index 7722272dfcc27..963a47a21fa9f 100644 --- a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h +++ b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h @@ -460,4 
+460,6 @@ at::Tensor _qconv_prepack_onednn( int64_t groups, std::optional> input_shape=std::nullopt); +#define FP8E4M3_MAX 448.0 + #endif // #if AT_MKLDNN_ENABLED() diff --git a/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp b/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp index e9e32e43ae022..42c000ee09d5c 100644 --- a/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp @@ -17,6 +17,7 @@ #include #include +#include namespace at::native { @@ -53,8 +54,8 @@ static void upsample_nearest2d_out_frame( return; } - std::unique_ptr input_offset_arr(new int64_t[output_width]); - int64_t* input_offset = input_offset_arr.get(); + std::vector input_offset_arr(output_width); + int64_t* input_offset = input_offset_arr.data(); for (const auto w2 : c10::irange(output_width)) { const int64_t w1 = nn_compute_source_index_fn(width_scale, w2, input_width); diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index 8624c9ef03367..3b50bad579023 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -1483,6 +1483,8 @@ static at::Tensor _fp8_convolution_onednn_ref( } y_f32.div_(output_scale); if (x1.scalar_type() == at::kFloat8_e4m3fn) { + // Avoid NaN + y_f32.clamp_(-FP8E4M3_MAX, FP8E4M3_MAX); // Align with oneDNN: convert fp32 to fp8 by fp32 -> fp16 -> fp8 y_f32 = y_f32.to(at::kHalf); } @@ -1497,6 +1499,8 @@ static at::Tensor _fp8_convolution_onednn_ref( y_f32.div_(output_scale); auto out_dtype = output_dtype.has_value() ? output_dtype.value() : at::kFloat8_e4m3fn; if (out_dtype == at::kFloat8_e4m3fn) { + // Avoid NaN + y_f32.clamp_(-FP8E4M3_MAX, FP8E4M3_MAX); // Align with oneDNN: convert fp32 to fp8 by fp32 -> fp16 -> fp8 return y_f32.to(at::kHalf).to(out_dtype); } @@ -1730,12 +1734,13 @@ static at::Tensor _quantized_convolution_onednn( output_sizes = at::native::conv_output_size(input_size, kernel_size, padding.vec(), stride.vec(), dilation.vec()); ideep::dims dst_dims = ideep::dims({output_sizes.cbegin(), output_sizes.cend()}); // Output is not a quantized tensor but data type is uint8 + auto out_dtype = output_dtype.has_value() ? output_dtype.value() : act_dtype; at::Tensor output = has_accum_postop_sum ? accum.value() : at::empty( dst_dims, at::device(c10::kCPU) - .dtype(fp32_output ? c10::kFloat : (bfloat16_output ? c10::kBFloat16 : act_dtype)) + .dtype(out_dtype) .memory_format(kSpatialDim == 2 ? c10::MemoryFormat::ChannelsLast : c10::MemoryFormat::ChannelsLast3d) @@ -1755,6 +1760,16 @@ static at::Tensor _quantized_convolution_onednn( unary_scalars, unary_algorithm.has_value() ? unary_algorithm.value() : "" ); + // Avoid NaN if output dtype is fp8 + if (out_dtype == c10::kFloat8_e4m3fn) { + // To avoid NaN, we need to clamp the intermediate results (in fp32) to [-448, 448] + // before converting to fp8 + auto post_ops = op_attr.get_post_ops(); + post_ops.append_eltwise(dnnl::algorithm::eltwise_linear, 1.0/output_scale, 0.0); + post_ops.append_eltwise(dnnl::algorithm::eltwise_clip, -FP8E4M3_MAX, FP8E4M3_MAX); + op_attr.set_post_ops(post_ops); + output_scale = 1.0f; + } #if IDEEP_PREREQ(3, 1, 0, 0) // Use oneDNN's APIs instead of prepare/compute from ideep to reduce integration overhead.
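Both fp8 paths above protect the final cast in the same way: the eager reference clamps the fp32 intermediate into the finite float8_e4m3fn range before the fp32 -> fp16 -> fp8 conversion, and the fused oneDNN path expresses the same step as an eltwise_linear post-op (folding in 1/output_scale) followed by an eltwise_clip, with output_scale reset to 1. Since e4m3fn has no infinity encoding and its largest finite value is 448, an unclamped overshoot cannot saturate to inf and can surface as NaN, which is what the "Avoid NaN" comments refer to. A small standalone sketch of the clamp step (the fp8 storage itself is not modeled here):

// Standalone sketch of the pre-cast clamp; matches FP8E4M3_MAX = 448.0 above.
#include <algorithm>
#include <iostream>

constexpr float kFp8E4M3Max = 448.0f;

// Mirror of the eager path: divide by output_scale, then clamp into the
// finite e4m3fn range before the narrowing fp32 -> fp16 -> fp8 cast.
float prepare_for_fp8(float acc, float output_scale) {
  float y = acc / output_scale;
  return std::min(std::max(y, -kFp8E4M3Max), kFp8E4M3Max);
}

int main() {
  // An accumulator of 100 with output_scale 0.1 overshoots to 1000, which has
  // no finite e4m3fn encoding; the clamp keeps the value at 448.
  std::cout << prepare_for_fp8(100.0f, 0.1f) << "\n";  // prints: 448
}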
diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp index 1e91fecd45005..807a9b25d3772 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp @@ -333,14 +333,14 @@ Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight) { weight.scalar_type() == at::ScalarType::Float || weight.scalar_type() == at::ScalarType::Half, "'embedding_bag_byte_prepack' only support float32 or float16."); - const auto weight_sizes = weight.sizes(); - const auto cols_dim = weight_sizes.size() - 1; - const int32_t embedding_cols = static_cast(weight_sizes[cols_dim]); + const auto weight_sizes = weight.sym_sizes(); + const auto cols_dim = weight.ndimension() - 1; + const auto embedding_cols = weight_sizes[cols_dim]; // Add 8 bytes per column to store FP32 scale and zero_point per row. - const int32_t output_columns = static_cast(embedding_cols + 2 * sizeof(float)); + const auto output_columns = embedding_cols + 2 * sizeof(float); // Adjust output dimensions to account for FP32 scale and zero_points. - std::vector output_shape = weight_sizes.vec(); + auto output_shape = weight_sizes.vec(); output_shape.at(cols_dim) = output_columns; at::SymDimVector output_shape_vec(output_shape); diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index bd6a1086c8cb9..a3a494d16fd69 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -1012,6 +1012,12 @@ static at::Tensor fp8_qlinear_onednn_ref( "onednn qlinear: unsupported unary post op ", unary_post_op, " with binary post op sum"); } y_f32.div_(output_scale); + if (x1.scalar_type() == c10::kFloat8_e4m3fn) { + // Avoid NaN + y_f32.clamp_(-FP8E4M3_MAX, FP8E4M3_MAX); + // Align with oneDNN: convert fp32 to fp8 by fp32 -> fp16 -> fp8 + y_f32 = y_f32.to(at::kHalf); + } x1.copy_(y_f32.to(x1.scalar_type()).view(x1.sizes())); return x1; } else if (binary_post_op == "add") { @@ -1038,6 +1044,12 @@ static at::Tensor fp8_qlinear_onednn_ref( y_f32.div_(output_scale); y_f32 = y_f32.view(output_size); auto out_dtype = output_dtype.has_value() ? output_dtype.value() : at::kFloat8_e4m3fn; + if (out_dtype == at::kFloat8_e4m3fn) { + // Avoid NaN + y_f32.clamp_(-FP8E4M3_MAX, FP8E4M3_MAX); + // Align with oneDNN: convert fp32 to fp8 by fp32 -> fp16 -> fp8 + return y_f32.to(at::kHalf).to(out_dtype); + } return y_f32.to(out_dtype); } @@ -1118,7 +1130,7 @@ static at::Tensor linear_int8_with_onednn_weight( #if defined(__powerpc__) if (is_fp8) { #else - if(is_fp8 && !cpuinfo_has_x86_amx_int8()) { + if(is_fp8 && !cpuinfo_has_x86_amx_fp16()) { #endif // Fall back to ref impl on old platforms because not supported // Transpose weight to align with behavior in oneDNN @@ -1155,12 +1167,13 @@ static at::Tensor linear_int8_with_onednn_weight( } std::vector src_dims = {M, K}; std::vector dst_dims = {M, N}; + auto out_dtype = output_dtype.has_value() ? output_dtype.value() : input.scalar_type(); at::Tensor output = binary_post_op == "sum" ? other.value() : at::empty( dst_dims, at::device(c10::kCPU) - .dtype(fp32_output ? c10::kFloat : (bf16_output ? 
c10::kBFloat16 : input.scalar_type())) + .dtype(out_dtype) ); if (output.numel() == 0) { return output; } @@ -1195,6 +1208,16 @@ static at::Tensor linear_int8_with_onednn_weight( unary_post_op_args, unary_post_op_algorithm ); + // Avoid NaN if output dtype is fp8 + if (out_dtype == c10::kFloat8_e4m3fn) { + // To avoid NaN, we need to clamp the intermediate results (in fp32) to [-448, 448] + // before converting to fp8 + auto post_ops = op_attr.get_post_ops(); + post_ops.append_eltwise(dnnl::algorithm::eltwise_linear, 1.0/output_scale, 0.0); + post_ops.append_eltwise(dnnl::algorithm::eltwise_clip, -FP8E4M3_MAX, FP8E4M3_MAX); + op_attr.set_post_ops(post_ops); + output_scale = 1.0f; + } if (input_scale != 1.0f) { op_attr.set_scales_mask(DNNL_ARG_SRC, 0); } diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp index 3bd68feca1c2f..b4ae4e677bcd2 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp @@ -305,7 +305,7 @@ static inline at::Tensor pack_weight_to_onednn_tensor( #if defined(__powerpc__) if (is_fp8){ #else - if(is_fp8 && !cpuinfo_has_x86_amx_int8()) { + if(is_fp8 && !cpuinfo_has_x86_amx_fp16()) { #endif // oneDNN's fp8 requires AMX support // If AMX is not available, fall back to reference implementation diff --git a/aten/src/ATen/native/sparse/SparseBlasImpl.cpp b/aten/src/ATen/native/sparse/SparseBlasImpl.cpp index 5a3f5f14dc0a7..c841da8354b5f 100644 --- a/aten/src/ATen/native/sparse/SparseBlasImpl.cpp +++ b/aten/src/ATen/native/sparse/SparseBlasImpl.cpp @@ -23,6 +23,9 @@ #include #endif +#if AT_USE_EIGEN_SPARSE() +#include +#endif namespace at::native::sparse::impl { @@ -442,13 +445,15 @@ void add_out_sparse_csr( const Tensor& mat2, const Scalar& alpha, const Tensor& result) { -#if !AT_MKL_ENABLED() - TORCH_CHECK( - false, - "Calling add on a sparse CPU tensor requires compiling PyTorch with MKL. ", - "Please use PyTorch built MKL support."); -#else +#if AT_USE_MKL_SPARSE() sparse::impl::mkl::add_out_sparse_csr(mat1, mat2, alpha, result); +#elif AT_USE_EIGEN_SPARSE() + sparse::impl::eigen::add_out_sparse(mat1, mat2, alpha, result); +#else + TORCH_CHECK( + false, + "Calling add on a sparse CPU tensor requires compiling PyTorch with MKL. ", + "Please use PyTorch built MKL support."); #endif } @@ -459,7 +464,7 @@ void triangular_solve_out_sparse_csr( bool upper, bool transpose, bool unitriangular) { -#if !AT_MKL_ENABLED() +#if !AT_USE_MKL_SPARSE() TORCH_CHECK( false, "Calling triangular_solve on a sparse CPU tensor requires compiling PyTorch with MKL. ", diff --git a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp index ba94f98551747..4faa135713d65 100644 --- a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp @@ -127,6 +127,10 @@ #include #endif +#if AT_USE_EIGEN_SPARSE() +#include +#endif + #include namespace at { @@ -536,7 +540,12 @@ static void addmm_out_sparse_csr_native_cpu( auto values = sparse.values(); scalar_t cast_alpha = alpha.to(); - r.mul_(beta); + // If beta is zero NaN and Inf should not be propagated to the result + if (beta.toComplexDouble() == 0.) 
{ + r.zero_(); + } else { + r.mul_(beta); + } AT_DISPATCH_INDEX_TYPES( col_indices.scalar_type(), "csr_mm_crow_indices", [&]() { auto csr_accessor = csr.accessor(); @@ -648,6 +657,15 @@ Tensor& addmm_out_sparse_compressed_cpu( return result; } +#if AT_USE_EIGEN_SPARSE() + if ((result.layout() == kSparseCsr || result.layout() == kSparseCsc) && + (mat1.layout() == kSparseCsr || mat1.layout() == kSparseCsc) && + (mat2.layout() == kSparseCsr || mat2.layout() == kSparseCsc)) { + sparse::impl::eigen::addmm_out_sparse(mat1, mat2, result, alpha, beta); + return result; + } +#endif + #if !AT_USE_MKL_SPARSE() // The custom impl addmm_out_sparse_csr_native_cpu only supports CSR @ // strided -> strided diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh index 693ca536a3198..c11588a32ba05 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh @@ -196,9 +196,17 @@ C10_LAUNCH_BOUNDS_1(num_threads()) __global__ void coalesceValuesKernel( int64_t *segment_offsets, int64_t *value_indices, Dtype *values, Dtype *newValues, - int64_t nnz, int64_t newNnz, int64_t stride) { + int64_t nnz, int64_t newNnz, +#ifdef USE_ROCM + int64_t nsegments, +#endif + int64_t stride) { - int seg = blockIdx.x * 4 + threadIdx.y; +#ifdef USE_ROCM + int64_t seg = (blockIdx.x * gridDim.y + blockIdx.y) * 4 + threadIdx.y; +#else + int64_t seg = blockIdx.x * 4 + threadIdx.y; +#endif // Number of values processed by each thread (grain size) const int SZ = 4; @@ -207,7 +215,11 @@ __global__ void coalesceValuesKernel( const int newValueRow = seg * stride; const int begin = segment_offsets[seg]; const int end = (seg < newNnz - 1) ? segment_offsets[seg + 1] : nnz; +#ifdef USE_ROCM + const int startFeature = threadIdx.x + blockIdx.z * nsegments * SZ; +#else const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ; +#endif Acctype tmp[SZ]; #pragma unroll for (int ii = 0; ii < SZ; ii++) { @@ -250,9 +262,17 @@ C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE*4) __global__ void coalesceValuesKernel( int64_t *segment_offsets, int64_t *value_indices, bool *values, bool *newValues, - int64_t nnz, int64_t newNnz, int64_t stride) { + int64_t nnz, int64_t newNnz, +#ifdef USE_ROCM + int64_t nsegments, +#endif + int64_t stride) { - int seg = blockIdx.x * 4 + threadIdx.y; +#ifdef USE_ROCM + int64_t seg = (blockIdx.x * gridDim.y + blockIdx.y) * 4 + threadIdx.y; +#else + int64_t seg = blockIdx.x * 4 + threadIdx.y; +#endif // Number of values processed by each thread (grain size) const int SZ = 4; @@ -261,7 +281,11 @@ __global__ void coalesceValuesKernel( const int newValueRow = seg * stride; const int begin = segment_offsets[seg]; const int end = (seg < newNnz - 1) ? 
segment_offsets[seg + 1] : nnz; +#ifdef USE_ROCM + const int startFeature = threadIdx.x + blockIdx.z * nsegments * SZ; +#else const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ; +#endif bool tmp[SZ]; #pragma unroll for (int ii = 0; ii < SZ; ii++) { diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu index a36ec9b203fc3..b59221a3231a5 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -106,8 +106,17 @@ SparseTensor _coalesce_sparse_cuda(const SparseTensor& self) { values = values.contiguous(); int64_t stride = c10::multiply_integers(values.sizes().slice(1)); int warp_size = at::cuda::warp_size(); +#ifdef USE_ROCM + const int64_t BATCHING_SEGMENT = 4096; + int64_t nsegments = ceil_div(newNnz, (int64_t) SZ); + int64_t s_batch = ceil_div(nsegments, BATCHING_SEGMENT); + dim3 grid(s_batch, (s_batch == 1) ? nsegments : BATCHING_SEGMENT, ceil_div(stride, (int64_t) warp_size*SZ)); +#else dim3 grid(ceil_div(newNnz, (int64_t) SZ), ceil_div(stride, (int64_t) warp_size*SZ)); +#endif dim3 block(warp_size, SZ); +#ifdef USE_ROCM + // Must duplicate the whole section otherwise does not compile on Windows AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( at::ScalarType::ComplexHalf, at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, values.scalar_type(), "coalesce_sparse_cuda", [&] { @@ -119,10 +128,28 @@ SparseTensor _coalesce_sparse_cuda(const SparseTensor& self) { newValues.data_ptr(), nnz, newNnz, + nsegments, stride ); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); +#else + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + at::ScalarType::ComplexHalf, at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, + values.scalar_type(), "coalesce_sparse_cuda", [&] { + using cuda_accscalar_t = acc_type; + apply::coalesceValuesKernel<<>>( + uniqueOffsets.data_ptr(), + origIndices.data_ptr(), + values.data_ptr(), + newValues.data_ptr(), + nnz, + newNnz, + stride + ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +#endif } // this grid-strided version is slower but probably more flexible diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index fe0ddd087dd3b..3730ceb913547 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -800,7 +800,7 @@ Tensor& bmm_out_sparse_cuda(const SparseTensor& self, const Tensor& mat2, Tensor Tensor indices_dim1 = indices[1].to(ScalarType::Int); Tensor indices_dim2 = indices[2].to(ScalarType::Int); - std::unique_ptr mat_el_end_indices_host(new int64_t[num_matrices]); + std::vector mat_el_end_indices_host(num_matrices); { auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); @@ -809,14 +809,14 @@ Tensor& bmm_out_sparse_cuda(const SparseTensor& self, const Tensor& mat2, Tensor search_end_matrix_indices(mat_el_end_indices_device, num_matrices, indices_dim0); AT_CUDA_CHECK(cudaMemcpy( - mat_el_end_indices_host.get(), + mat_el_end_indices_host.data(), mat_el_end_indices_device, num_matrices*sizeof(int64_t), cudaMemcpyDeviceToHost )); } // Need a pointer to an array to access within a lambda - int64_t* mat_el_end_indices = &mat_el_end_indices_host[0]; + int64_t* mat_el_end_indices = mat_el_end_indices_host.data(); Scalar beta = 0; Scalar alpha = 1; diff --git a/aten/src/ATen/native/sparse/eigen/SparseBlasImpl.cpp b/aten/src/ATen/native/sparse/eigen/SparseBlasImpl.cpp 
new file mode 100644 index 0000000000000..20738992a61d9 --- /dev/null +++ b/aten/src/ATen/native/sparse/eigen/SparseBlasImpl.cpp @@ -0,0 +1,329 @@ +#include + +#if AT_USE_EIGEN_SPARSE() + +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +#include + +#include + +namespace at::native::sparse::impl::eigen { + +namespace { + +void inline sparse_indices_to_result_dtype_inplace( + const c10::ScalarType& dtype, + const at::Tensor& input) { + auto [compressed_indices, plain_indices] = + at::sparse_csr::getCompressedPlainIndices(input); + static_cast(input.unsafeGetTensorImpl()) + ->set_member_tensors( + compressed_indices.to(dtype), + plain_indices.to(dtype), + input.values(), + input.sizes()); +} + +void inline sparse_indices_and_values_resize( + const at::Tensor& input, + int64_t nnz) { + auto [compressed_indices, plain_indices] = + at::sparse_csr::getCompressedPlainIndices(input); + static_cast(input.unsafeGetTensorImpl()) + ->set_member_tensors( + compressed_indices, + plain_indices.resize_({nnz}), + input.values().resize_({nnz}), + input.sizes()); +} + +template +const Eigen::Map> +Tensor_to_Eigen(const at::Tensor& tensor) { + int64_t rows = tensor.size(0); + int64_t cols = tensor.size(1); + int64_t nnz = tensor._nnz(); + TORCH_CHECK(tensor.values().is_contiguous(), "eigen accepts only contiguous tensor values"); + auto [compressed_indices, plain_indices] = at::sparse_csr::getCompressedPlainIndices(tensor); + index_t* c_indices_ptr = compressed_indices.data_ptr(); + index_t* p_indices_ptr = plain_indices.data_ptr(); + scalar_t* values_ptr = tensor.values().data_ptr(); + Eigen::Map> map( + rows, cols, nnz, c_indices_ptr, p_indices_ptr, values_ptr); + return map; +} + +template +void Eigen_to_Tensor( + const at::Tensor& tensor, + const Eigen::SparseMatrix& matrix) { + const Layout eigen_layout = (eigen_options == Eigen::RowMajor ? kSparseCsr : kSparseCsc); + TORCH_CHECK( + tensor.layout() == eigen_layout, + "Eigen_to_Tensor, expected tensor be ", eigen_layout, ", but got ", + tensor.layout()); + int64_t nnz = matrix.nonZeros(); + int64_t csize = matrix.outerSize(); + sparse_indices_and_values_resize(tensor, nnz); + auto [compressed_indices, plain_indices] = at::sparse_csr::getCompressedPlainIndices(tensor); + if (nnz > 0) { + std::memcpy( + tensor.values().mutable_data_ptr(), + matrix.valuePtr(), + nnz * sizeof(scalar_t)); + std::memcpy( + plain_indices.mutable_data_ptr(), + matrix.innerIndexPtr(), + nnz * sizeof(index_t)); + } + if (csize > 0) { + std::memcpy( + compressed_indices.mutable_data_ptr(), + matrix.outerIndexPtr(), + csize * sizeof(index_t)); + } + compressed_indices.mutable_data_ptr()[csize] = nnz; +} + +template +void add_out_sparse_eigen( + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Scalar& alpha, + const at::Tensor& result) { + // empty matrices + if (mat1._nnz() == 0 && mat2._nnz() == 0) { + return; + } + + if (mat2._nnz() == 0 || alpha.toComplexDouble() == 0.) 
{ + sparse_indices_and_values_resize(result, mat1._nnz()); + result.copy_(mat1); + return; + } else if (mat1._nnz() == 0) { + sparse_indices_and_values_resize(result, mat2._nnz()); + result.copy_(mat2); + result.values().mul_(alpha); + return; + } + + c10::ScalarType result_index_dtype = at::sparse_csr::getIndexDtype(result); + + sparse_indices_to_result_dtype_inplace(result_index_dtype, mat1); + sparse_indices_to_result_dtype_inplace(result_index_dtype, mat2); + + AT_DISPATCH_INDEX_TYPES( + result_index_dtype, "eigen_sparse_add", [&]() { + scalar_t _alpha = alpha.to(); + + if (result.layout() == kSparseCsr) { + auto mat1_eigen = Tensor_to_Eigen(mat1); + auto mat2_eigen = Tensor_to_Eigen(mat2); + auto mat1_mat2_eigen = (mat1_eigen + _alpha * mat2_eigen); + Eigen_to_Tensor(result, mat1_mat2_eigen); + } else { + auto mat1_eigen = Tensor_to_Eigen(mat1); + auto mat2_eigen = Tensor_to_Eigen(mat2); + auto mat1_mat2_eigen = (mat1_eigen + _alpha * mat2_eigen); + Eigen_to_Tensor(result, mat1_mat2_eigen); + } + }); +} + +template +void addmm_out_sparse_eigen( + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Tensor& result, + const at::Scalar& alpha, + const at::Scalar& beta) { + // empty matrices + if (mat1._nnz() == 0 || mat2._nnz() == 0) { + return; + } + + // If beta is zero NaN and Inf should not be propagated to the result + // In addition, beta = 0 lets us enable a fast-path for result = alpha * A @ B + bool is_beta_zero = false; + if (beta.toComplexDouble() == 0.) { + is_beta_zero = true; + result.values().zero_(); + } else { + result.values().mul_(beta); + } + + c10::ScalarType result_index_dtype = at::sparse_csr::getIndexDtype(result); + + sparse_indices_to_result_dtype_inplace(result_index_dtype, mat1); + sparse_indices_to_result_dtype_inplace(result_index_dtype, mat2); + + AT_DISPATCH_INDEX_TYPES( + result_index_dtype, "eigen_sparse_mm", [&]() { + typedef Eigen::SparseMatrix EigenCsrMatrix; + typedef Eigen::SparseMatrix EigenCscMatrix; + + at::Tensor mat1_mat2; + if (is_beta_zero) { + mat1_mat2 = result; + } else { + mat1_mat2 = at::empty_like(result, result.options()); + } + + if (mat1_mat2.layout() == kSparseCsr) { + if (mat1.layout() == kSparseCsr) { + const auto mat1_eigen = Tensor_to_Eigen(mat1); + if (mat2.layout() == kSparseCsr) { + // Out_csr = M1_csr * M2_csr + const auto mat2_eigen = Tensor_to_Eigen(mat2); + const EigenCsrMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen); + Eigen_to_Tensor(mat1_mat2, mat1_mat2_eigen); + } else { + // Out_csr = M1_csr * M2_csc + const auto mat2_eigen = Tensor_to_Eigen(mat2); + const EigenCsrMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen); + Eigen_to_Tensor(mat1_mat2, mat1_mat2_eigen); + } + } else { + const auto mat1_eigen = Tensor_to_Eigen(mat1); + if (mat2.layout() == kSparseCsr) { + // Out_csr = M1_csc * M2_csr + const auto mat2_eigen = Tensor_to_Eigen(mat2); + const EigenCsrMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen); + Eigen_to_Tensor(mat1_mat2, mat1_mat2_eigen); + } else { + // Out_csr = M1_csc * M2_csc + // This multiplication will be computationally inefficient, as it will require + // additional conversion of the output matrix from CSC to CSR format. 
+ const auto mat2_eigen = Tensor_to_Eigen(mat2); + const EigenCsrMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen); + Eigen_to_Tensor(mat1_mat2, mat1_mat2_eigen); + } + } + } else { + if (mat1.layout() == kSparseCsr) { + const auto mat1_eigen = Tensor_to_Eigen(mat1); + if (mat2.layout() == kSparseCsr) { + // Out_csc = M1_csr * M2_csr + // This multiplication will be computationally inefficient, as it will require + // additional conversion of the output matrix from CSR to CSC format. + const auto mat2_eigen = Tensor_to_Eigen(mat2); + const EigenCscMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen); + Eigen_to_Tensor(mat1_mat2, mat1_mat2_eigen); + } else { + // Out_csc = M1_csr * M2_csc + const auto mat2_eigen = Tensor_to_Eigen(mat2); + const EigenCscMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen); + Eigen_to_Tensor(mat1_mat2, mat1_mat2_eigen); + } + } else { + const auto mat1_eigen = Tensor_to_Eigen(mat1); + if (mat2.layout() == kSparseCsr) { + // Out_csc = M1_csc * M2_csr + const auto mat2_eigen = Tensor_to_Eigen(mat2); + const EigenCscMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen); + Eigen_to_Tensor(mat1_mat2, mat1_mat2_eigen); + } else { + // Out_csc = M1_csc * M2_csc + const auto mat2_eigen = Tensor_to_Eigen(mat2); + const EigenCscMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen); + Eigen_to_Tensor(mat1_mat2, mat1_mat2_eigen); + } + } + } + + if (is_beta_zero) { + result.mul_(alpha.to()); + } else { + result.add_(mat1_mat2, alpha.to()); + } + }); +} + +} // anonymous namespace + +void addmm_out_sparse( + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Tensor& result, + const at::Scalar& alpha, + const at::Scalar& beta) { + AT_DISPATCH_SPARSE_COMPRESSED_NONBLOCK_LAYOUTS(mat1.layout(), "eigen::addmm_out_sparse:mat1", [&]{}); + AT_DISPATCH_SPARSE_COMPRESSED_NONBLOCK_LAYOUTS(mat2.layout(), "eigen::addmm_out_sparse:mat2", [&]{}); + AT_DISPATCH_SPARSE_COMPRESSED_NONBLOCK_LAYOUTS(result.layout(), "eigen::addmm_out_sparse:result", [&]{}); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + result.scalar_type(), "addmm_out_sparse_eigen", [&] { + addmm_out_sparse_eigen(mat1, mat2, result, alpha, beta); + }); +} + +void add_out_sparse( + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Scalar& alpha, + const at::Tensor& result) { + TORCH_CHECK( + (result.layout() == kSparseCsr && mat1.layout() == kSparseCsr && mat2.layout() == kSparseCsr) || + (result.layout() == kSparseCsc && mat1.layout() == kSparseCsc && mat2.layout() == kSparseCsc), + "eigen::add_out_sparse: expected the same layout for all operands but got ", + mat1.layout(), + " + ", + mat2.layout(), + " -> ", + result.layout()); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + result.scalar_type(), "add_out_sparse_eigen", [&] { + add_out_sparse_eigen(mat1, mat2, alpha, result); + }); +} + +} // namespace at::native::sparse::impl::eigen + +#else + +namespace at::native::sparse::impl::eigen { + +void addmm_out_sparse( + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Tensor& result, + const at::Scalar& alpha, + const at::Scalar& beta) { + TORCH_CHECK( + false, + "eigen::addmm_out_sparse: Eigen was not enabled for ", + result.layout(), + " + ", + mat1.layout(), + " @ ", + mat2.layout()); +} + +void add_out_sparse( + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Scalar& alpha, + const at::Tensor& result) { + TORCH_CHECK( + false, + "eigen::add_out_sparse: Eigen was not enabled for ", + mat1.layout(), + " + ", + mat2.layout(), + " -> ", + result.layout()); +} + +} // namespace 
at::native::sparse::impl::eigen + +#endif // AT_USE_EIGEN_SPARSE() diff --git a/aten/src/ATen/native/sparse/eigen/SparseBlasImpl.h b/aten/src/ATen/native/sparse/eigen/SparseBlasImpl.h new file mode 100644 index 0000000000000..d8e8dc322bc37 --- /dev/null +++ b/aten/src/ATen/native/sparse/eigen/SparseBlasImpl.h @@ -0,0 +1,29 @@ +#pragma once + +#include + +#if AT_USE_EIGEN_SPARSE() +#ifndef EIGEN_MPL2_ONLY +#define EIGEN_MPL2_ONLY +#endif + +#include + +namespace at::native::sparse::impl::eigen { + +void addmm_out_sparse( + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Tensor& result, + const at::Scalar& alpha, + const at::Scalar& beta); + +void add_out_sparse( + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Scalar& alpha, + const at::Tensor& result); + +} // namespace at::native::sparse::impl::eigen + +#endif diff --git a/aten/src/ATen/native/sparse/mps/FlattenIndices.mm b/aten/src/ATen/native/sparse/mps/FlattenIndices.mm new file mode 100644 index 0000000000000..41efa545cd2a8 --- /dev/null +++ b/aten/src/ATen/native/sparse/mps/FlattenIndices.mm @@ -0,0 +1,73 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + +namespace at::native { +namespace { + +using namespace mps; +using namespace at::sparse; + +#ifndef PYTORCH_JIT_COMPILE_SHADERS +static auto& lib = mps::MetalShaderLibrary::getBundledLibrary(); +#else +#include +#endif + +Tensor flatten_indices_mps(const Tensor& indices, IntArrayRef size) { + TORCH_CHECK(indices.dim() == 2, "flatten_indices: indices must be 2D"); + TORCH_CHECK(static_cast(indices.size(0)) == size.size(), + "flatten_indices: indices.size(0) must equal size.size()"); + + const int64_t sparse_dim = indices.size(0); + const int64_t nnz = indices.size(1); + + if (nnz == 0) { + return at::empty({0}, indices.options().dtype(kLong)); + } + + // Row-major multipliers for flattening: mul[d] = prod_{j>d}(size[j]) + std::vector row_muls(sparse_dim); + row_muls[sparse_dim - 1] = 1; + for (int64_t i = sparse_dim - 2; i >= 0; --i) { + row_muls[i] = row_muls[i + 1] * size[i + 1]; + } + + auto flat_indices = at::empty({nnz}, indices.options().dtype(kLong)); + + auto stream = getCurrentMPSStream(); + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pipeline = lib.getPipelineStateForFunc("flatten_indices_kernel"); + auto encoder = stream->commandEncoder(); + [encoder setComputePipelineState:pipeline]; + mtl_setArgs(encoder, + indices, + row_muls, + flat_indices, + static_cast(sparse_dim), + indices.strides() + ); + + mtl_dispatch1DJob(encoder, pipeline, nnz); + } + }); + return flat_indices; +} + +} // namespace +REGISTER_MPS_DISPATCH(flatten_indices_stub, &flatten_indices_mps) +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/sparse/mps/SparseMPSTensor.mm b/aten/src/ATen/native/sparse/mps/SparseMPSTensor.mm new file mode 100644 index 0000000000000..3e0ac4e35da1a --- /dev/null +++ b/aten/src/ATen/native/sparse/mps/SparseMPSTensor.mm @@ -0,0 +1,183 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + +namespace at::native { + +using namespace mps; +using namespace at::sparse; + +#ifndef PYTORCH_JIT_COMPILE_SHADERS +static auto& lib = mps::MetalShaderLibrary::getBundledLibrary(); +#else +#include +#endif + 
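flatten_indices_mps above collapses each n-dimensional sparse coordinate into a single row-major key, key = sum over d of index[d] * mul[d] with mul[d] = prod_{j > d} size[j], which is what the MPS coalesce path later sorts and deduplicates. A plain CPU sketch of the same computation (standalone code, not the Metal kernel):

// CPU sketch of the row-major index flattening used before coalescing.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> flatten_indices_cpu(
    const std::vector<std::vector<int64_t>>& indices,  // [sparse_dim][nnz]
    const std::vector<int64_t>& size) {
  const int64_t sparse_dim = static_cast<int64_t>(size.size());
  const int64_t nnz = indices.empty() ? 0 : static_cast<int64_t>(indices[0].size());

  // Row-major multipliers: mul[d] = prod_{j > d} size[j]
  std::vector<int64_t> mul(sparse_dim, 1);
  for (int64_t d = sparse_dim - 2; d >= 0; --d) {
    mul[d] = mul[d + 1] * size[d + 1];
  }

  std::vector<int64_t> flat(nnz, 0);
  for (int64_t n = 0; n < nnz; ++n) {
    for (int64_t d = 0; d < sparse_dim; ++d) {
      flat[n] += indices[d][n] * mul[d];
    }
  }
  return flat;
}

int main() {
  // Two nonzeros of a 3x4 tensor at coordinates (0, 2) and (2, 1).
  auto flat = flatten_indices_cpu({{0, 2}, {2, 1}}, {3, 4});
  std::cout << flat[0] << " " << flat[1] << "\n";  // prints: 2 9
}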
+static Tensor compute_output_positions(const Tensor& is_unique) { + + int64_t nnz = is_unique.size(0); + if (nnz == 0) { + return at::empty({0}, TensorOptions().device(kMPS).dtype(kInt)); + } + + Tensor positions = at::empty({nnz}, TensorOptions().device(kMPS).dtype(kInt)); + + auto stream = getCurrentMPSStream(); + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pipeline = lib.getPipelineStateForFunc("compute_output_positions_kernel"); + auto encoder = stream->commandEncoder(); + [encoder setComputePipelineState:pipeline]; + + mtl_setArgs(encoder, is_unique, positions); + mtl_dispatch1DJob(encoder, pipeline, nnz); + } + }); + + return positions; +} + +static Tensor compute_output_positions_parallel(const Tensor& is_unique) { + + int64_t nnz = is_unique.size(0); + if (nnz == 0) { + return at::empty({0}, TensorOptions().device(kMPS).dtype(kInt)); + } + + // for small arrays, use simple kernel + // speed of the naive kernel drops off after 4096 nnz elements + if (nnz <= 4096) { + return compute_output_positions(is_unique); + } + auto stream = getCurrentMPSStream(); + Tensor positions = is_unique.to(kInt); + // Kogge-Stone parallel prefix sum + Tensor positions_cloned = positions.clone(); + + for (int64_t stride = 1; stride < nnz; stride *= 2) { + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pipeline = lib.getPipelineStateForFunc("kogge_stone_step"); + auto encoder = stream->commandEncoder(); + [encoder setComputePipelineState:pipeline]; + + mtl_setArgs(encoder, positions, positions_cloned, stride); + mtl_dispatch1DJob(encoder, pipeline, nnz); + } + }); + std::swap(positions, positions_cloned); + } + + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pipeline = lib.getPipelineStateForFunc("shift_right_kernel"); + auto encoder = stream->commandEncoder(); + [encoder setComputePipelineState:pipeline]; + + mtl_setArgs(encoder, positions, positions_cloned); + mtl_dispatch1DJob(encoder, pipeline, nnz); + } + }); + + return positions_cloned; +} + +static std::pair mark_unique_and_count(const Tensor& flat_indices) { + + int64_t nnz = flat_indices.size(0); + if (nnz == 0) { + return {at::empty({0}, flat_indices.options().dtype(kBool)), 0}; + } + + Tensor is_unique = at::empty({nnz}, flat_indices.options().dtype(kBool)); + Tensor count_result = at::zeros({1}, flat_indices.options().dtype(kInt)); + + auto stream = getCurrentMPSStream(); + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pipeline = lib.getPipelineStateForFunc("mark_unique_positions_and_count_kernel"); + auto encoder = stream->commandEncoder(); + [encoder setComputePipelineState:pipeline]; + + mtl_setArgs(encoder, flat_indices, is_unique, count_result); + mtl_dispatch1DJob(encoder, pipeline, nnz); + } + }); + + int32_t num_unique = count_result.item(); + + return {is_unique, num_unique}; +} + +SparseTensor _coalesce_sparse_mps(const SparseTensor& self) { + int64_t nnz = self._nnz(); + TORCH_INTERNAL_ASSERT(!self.is_coalesced()); + if (nnz < 2) { + SparseTensor dst = self.clone(); + dst._coalesced_(true); + return dst; + } + + Tensor indices = self._indices(); + Tensor values = self._values(); + + Tensor flat_indices = flatten_indices(indices, self.sizes()); + Tensor sorted_order = flat_indices.argsort(); + Tensor flat_indices_sorted = flat_indices.index({sorted_order}); + values = values.index({sorted_order}); + indices = indices.index_select(1, sorted_order); + + auto unique_info = 
mark_unique_and_count(flat_indices_sorted); + Tensor is_unique = unique_info.first; + int32_t newNnz = unique_info.second; + + Tensor output_positions = compute_output_positions_parallel(is_unique); + + Tensor out_indices = at::empty({indices.size(0), newNnz}, indices.options()); + auto outValuesSize = values.sizes().vec(); + outValuesSize[0] = newNnz; + Tensor out_values = at::zeros(outValuesSize, values.options()); + + Tensor is_unique_local = is_unique; + int64_t sparse_dim = indices.size(0); + + auto stream = getCurrentMPSStream(); + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pipeline = lib.getPipelineStateForFunc("coalesce_with_positions_kernel_" + scalarToMetalTypeString(values)); + auto encoder = stream->commandEncoder(); + [encoder setComputePipelineState:pipeline]; + + const uint32_t numThreads = static_cast(nnz); + const uint32_t valueSize = static_cast(values.numel() / nnz); + mtl_setArgs(encoder, + flat_indices_sorted, + indices, + values, + is_unique_local, + output_positions, + out_indices, + out_values, + numThreads, + valueSize, + sparse_dim, + newNnz); + mtl_dispatch1DJob(encoder, pipeline, nnz); + } + }); + + SparseTensor result = _sparse_coo_tensor_unsafe_symint(out_indices, out_values, self.sym_sizes())._coalesced_(true); + return result; +} + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm new file mode 100644 index 0000000000000..07ee2e097b49e --- /dev/null +++ b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm @@ -0,0 +1,183 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +using namespace at::sparse; + +Tensor& add_out_dense_sparse_mps(Tensor& out, const Tensor& dense, const SparseTensor& sparse, const Scalar& alpha); + +Tensor& add_out_dense_sparse_mps( + Tensor& out, + const Tensor& dense, + const SparseTensor& sparse, + const Scalar& alpha) { + TORCH_CHECK(dense.is_mps(), "add: expected 'self' to be an MPS tensor, got ", dense.device()); + TORCH_CHECK(sparse.is_mps(), "add: expected 'other' to be an MPS tensor, got ", sparse.device()); + TORCH_CHECK(out.is_mps(), "add: expected 'out' to be an MPS tensor, got ", out.device()); + TORCH_CHECK(dense.sizes().equals(sparse.sizes()), + "add: expected 'self' and 'other' to have same size, but self has size ", + dense.sizes(), " while other has size ", sparse.sizes(), + " (FYI: dense-sparse addition does not currently support broadcasting)"); + + const int64_t nnz = sparse._nnz(); + if (nnz == 0) { + out.resize_as_(dense); + out.copy_(dense); + return out; + } + + auto commonDtype = at::result_type(dense, sparse); + TORCH_CHECK(canCast(commonDtype, out.scalar_type()), + "Can't convert result type ", commonDtype, " to output ", out.scalar_type()); + + Tensor r; + const bool need_separate_buffer = out.is_same(dense) || (out.scalar_type() != commonDtype); + if (need_separate_buffer) { + r = at::empty(dense.sizes(), out.options().dtype(commonDtype)); + } else { + r = out; + r.resize_as_(dense); + } + + Tensor dense_buffer = dense.to(commonDtype); + if (!r.is_same(dense_buffer)) { + r.copy_(dense_buffer); + } + + Tensor indices = sparse._indices(); + Tensor values = sparse._values().to(commonDtype); + if (values.numel() == 0) { + if (!out.is_same(r)) { + 
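+      // r already holds the dense input (cast to commonDtype); materialize it
+      // into out when they are distinct buffers before returning early.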
out.resize_as_(dense); + out.copy_(r); + } + return out; + } + + const int64_t nDim = r.dim(); + const int64_t nDimI = sparse.sparse_dim(); + TORCH_CHECK(nDimI >= 0 && nDimI <= nDim, + "Invalid sparse_dim=", nDimI, " for dense tensor of dim ", nDim); + + Tensor indices1D = at::sparse::flatten_indices(indices, sparse.sizes()).contiguous(); + + int64_t view_rows = 1; + int64_t view_cols = 1; + for (int64_t i = 0; i < nDimI; i++) { + view_rows *= r.size(i); + } + for (int64_t i = nDimI; i < nDim; i++) { + view_cols *= r.size(i); + } + + if (view_cols == 1) { + Tensor r_flat = r.reshape({view_rows}); + Tensor values_1d = values.reshape({nnz}); + r_flat.index_add_(0, indices1D, values_1d, alpha); + } else { + Tensor r_view = r.view({view_rows, view_cols}); + Tensor values_2d = values.reshape({nnz, view_cols}); + r_view.index_add_(0, indices1D, values_2d, alpha); + } + + if (!out.is_same(r)) { + out.resize_as_(dense); + out.copy_(r); + } + return out; +} + + +SparseTensor& add_out_sparse_mps(const SparseTensor& self, + const SparseTensor& other, + const Scalar& alpha, + SparseTensor& out) { + TORCH_CHECK(other.is_sparse(), "add(sparse, dense) is not supported. Use add(dense, sparse) instead."); + TORCH_CHECK(self.is_mps(), "add: expected 'self' to be MPS, but got ", self.device()); + TORCH_CHECK(other.is_mps(), "add: expected 'other' to be MPS, but got ", other.device()); + TORCH_CHECK(out.is_mps(), "add: expected 'out' to be MPS, but got ", out.device()); + if (!self.is_sparse()) { + return add_out_dense_sparse_mps(out, self, other, alpha); + } + auto commonDtype = at::result_type(self, other); + TORCH_CHECK(canCast(commonDtype, out.scalar_type()), + "Can't convert result type ", commonDtype, " to output ", out.scalar_type()); + + TORCH_CHECK(self.sizes().equals(other.sizes()), + "add: expected 'self' and 'other' to have same size, but ", self.sizes(), " != ", other.sizes()); + + if (other._nnz() == 0) { + out.resize_as_(self); + Tensor vals = self._values(); + if (vals.scalar_type() != out.scalar_type()) { + vals = vals.to(out.scalar_type()); + } + alias_into_sparse(out, self._indices(), vals); + out._coalesced_(self.is_coalesced()); + return out; + } + + if (self._nnz() == 0) { + out.resize_as_(other); + Tensor vals = other._values(); + if (!alpha.isIntegral(false) || alpha.to() != 1.0) { + vals = at::mul(vals, alpha); + } + if (vals.scalar_type() != out.scalar_type()) { + vals = vals.to(out.scalar_type()); + } + alias_into_sparse(out, other._indices(), vals); + out._coalesced_(other.is_coalesced()); + return out; + } + + TORCH_CHECK(is_same_density(self, other), + "add: expected 'self' and 'other' to have same density, but 'self' has ", + self.sparse_dim(), " sparse dimensions while 'other' has ", other.sparse_dim(), " sparse dimensions"); + + Tensor t_indices_ = self._indices(); + Tensor s_indices_ = other._indices(); + + Tensor t_values_ = self._values().to(commonDtype); + Tensor s_values_ = other._values().to(commonDtype); + if (!alpha.isIntegral(false) || alpha.to() != 1.0) { + s_values_ = at::mul(s_values_, alpha); + } + + Tensor r_indices_ = at::cat({t_indices_, s_indices_}, 1); + Tensor r_values_ = at::cat({t_values_, s_values_ }, 0); + + SparseTensor tmp = empty({0}, out.options().dtype(commonDtype)); + tmp.resize_as_(other); + alias_into_sparse(tmp, r_indices_, r_values_); + tmp = _coalesce_sparse_mps(tmp); + + out.resize_as_(other); + Tensor out_vals = tmp._values(); + if (out.scalar_type() != commonDtype) { + out_vals = out_vals.to(out.scalar_type()); + } + alias_into_sparse(out, 
tmp._indices(), out_vals); + out._coalesced_(tmp.is_coalesced()); + + return out; +} + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/sparse/mps/kernels/Coalesce.metal b/aten/src/ATen/native/sparse/mps/kernels/Coalesce.metal new file mode 100644 index 0000000000000..e32d1edf1c2f6 --- /dev/null +++ b/aten/src/ATen/native/sparse/mps/kernels/Coalesce.metal @@ -0,0 +1,117 @@ +#include +#include +using namespace metal; + + +kernel void compute_output_positions_kernel( + device const bool* is_unique [[buffer(0)]], + device int* positions [[buffer(1)]], + uint gid [[thread_position_in_grid]]) { + int pos = 0; + for (uint i = 0; i < gid; i++) { + if (is_unique[i]) + pos++; + } + positions[gid] = pos; +} + +kernel void mark_unique_positions_and_count_kernel( + device const int64_t* flat_indices [[buffer(0)]], + device bool* is_unique [[buffer(1)]], + device atomic_int* count [[buffer(2)]], + uint tid [[thread_position_in_grid]]) { + bool unique = (tid == 0) || (flat_indices[tid] != flat_indices[tid - 1]); + is_unique[tid] = unique; + + if (unique) { + atomic_fetch_add_explicit(count, 1, memory_order_relaxed); + } +} + +// Kogge-Stone parallel prefix sum step +kernel void kogge_stone_step( + device const int* input [[buffer(0)]], + device int* output [[buffer(1)]], + constant uint& stride [[buffer(2)]], + uint gid [[thread_position_in_grid]]) { + int val = input[gid]; + if (gid >= stride) { + val += input[gid - stride]; + } + output[gid] = val; +} + +// Shift right for exclusive scan +kernel void shift_right_kernel( + device const int* input [[buffer(0)]], + device int* output [[buffer(1)]], + uint gid [[thread_position_in_grid]]) { + output[gid] = (gid == 0) ? 0 : input[gid - 1]; +} + +template +kernel void coalesce_with_positions_kernel( + device const int64_t* flat_indices [[buffer(0)]], + device const int64_t* indices [[buffer(1)]], + device const T* in_values [[buffer(2)]], + device const bool* is_unique [[buffer(3)]], + device const int* output_positions [[buffer(4)]], + device int64_t* out_indices [[buffer(5)]], + device T* out_values [[buffer(6)]], + constant uint& nnz [[buffer(7)]], + constant uint& value_size [[buffer(8)]], + constant uint& sparse_dim [[buffer(9)]], + constant uint& total_unique [[buffer(10)]], + uint gid [[thread_position_in_grid]]) { + if (!is_unique[gid]) + return; + + int out_pos = output_positions[gid]; + + for (uint d = 0; d < sparse_dim; d++) { + out_indices[d * total_unique + out_pos] = indices[d * nnz + gid]; + } + + int64_t current_index = flat_indices[gid]; + uint end = gid + 1; + while (end < nnz && flat_indices[end] == current_index) { + end++; + } + + for (uint elem = 0; elem < value_size; elem++) { + T sum = 0; + for (uint j = gid; j < end; j++) { + sum += in_values[j * value_size + elem]; + } + out_values[out_pos * value_size + elem] = sum; + } +} + +#define INSTANTIATE_COALESCE_WITH_POSITIONS(DTYPE) \ + template \ + [[host_name("coalesce_with_positions_kernel_" #DTYPE)]] [[kernel]] void \ + coalesce_with_positions_kernel( \ + device const int64_t* flat_indices [[buffer(0)]], \ + device const int64_t* indices [[buffer(1)]], \ + device const DTYPE* in_values [[buffer(2)]], \ + device const bool* is_unique [[buffer(3)]], \ + device const int* output_positions [[buffer(4)]], \ + device int64_t* out_indices [[buffer(5)]], \ + device DTYPE* out_values [[buffer(6)]], \ + constant uint& nnz [[buffer(7)]], \ + constant uint& value_size [[buffer(8)]], \ + constant uint& sparse_dim [[buffer(9)]], \ + constant uint& 
total_unique [[buffer(10)]], \ + uint gid [[thread_position_in_grid]]); + +INSTANTIATE_COALESCE_WITH_POSITIONS(float); +INSTANTIATE_COALESCE_WITH_POSITIONS(half); +INSTANTIATE_COALESCE_WITH_POSITIONS(bfloat); +INSTANTIATE_COALESCE_WITH_POSITIONS(bool); +INSTANTIATE_COALESCE_WITH_POSITIONS(long); +INSTANTIATE_COALESCE_WITH_POSITIONS(char); +INSTANTIATE_COALESCE_WITH_POSITIONS(uchar); +INSTANTIATE_COALESCE_WITH_POSITIONS(short); +INSTANTIATE_COALESCE_WITH_POSITIONS(int); +INSTANTIATE_COALESCE_WITH_POSITIONS(float2); +INSTANTIATE_COALESCE_WITH_POSITIONS(half2); \ No newline at end of file diff --git a/aten/src/ATen/native/sparse/mps/kernels/FlattenIndices.metal b/aten/src/ATen/native/sparse/mps/kernels/FlattenIndices.metal new file mode 100644 index 0000000000000..00156dddb06c2 --- /dev/null +++ b/aten/src/ATen/native/sparse/mps/kernels/FlattenIndices.metal @@ -0,0 +1,19 @@ +#include +using namespace metal; + + +kernel void flatten_indices_kernel( + device const long* indices [[ buffer(0) ]], + device const long* row_muls [[ buffer(1) ]], + device long* flat_indices [[ buffer(2) ]], + constant uint& sparse_dim [[ buffer(3) ]], + constant long2& idx_strides [[ buffer(4) ]], + uint gid [[ thread_position_in_grid ]]) { + long flat = 0; + for (uint d = 0; d < sparse_dim; ++d) { + long off = (long)d * idx_strides.x + (long)gid * idx_strides.y; + long v = indices[off]; + flat += v * row_muls[d]; + } + flat_indices[gid] = flat; +} \ No newline at end of file diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index 80049aa9a832f..b8b43e0086c1a 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -849,16 +849,6 @@ std::tuple _efficient_ if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) std::optional out(res); std::optional seqused_k = std::nullopt; std::optional alibi_slopes = std::nullopt; @@ -1406,12 +1396,15 @@ std::tuple _efficient_ at::Tensor v_t = value.transpose(1, 2); at::Tensor output_t = res.transpose(1, 2); bool is_causal; - if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { - is_causal = true; - } else if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { + if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { is_causal = false; } else { - TORCH_CHECK(false, "[_efficient_attention_forward] Unsupported mask type on ROCM, for now"); + is_causal = true; +#if AOTRITON_V3_API == 0 + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) != custom_mask_type) { + TORCH_CHECK(false, "[_efficient_attention_forward] Unsupported mask type on ROCM, for now"); + } +#endif } at::Tensor atomic_counter; @@ -1436,7 +1429,51 @@ std::tuple _efficient_ auto offset_output = mk_philoxtensor(use_philox_state ? offset_t.data_ptr() : nullptr); auto persistent_counter = mk_atomictensor(is_causal ? 
atomic_counter.data_ptr() : nullptr); hipError_t err; // TODO: Error handling - if (seqstart_q.has_value()) { + if constexpr (AOTRITON_ALWAYS_V3_API) { // Better readability than nesting ifdef +#if AOTRITON_V3_API // if constexpr does not stop errors from undefined functions + using aotriton::v3::flash::CausalType; + using aotriton::v3::flash::VarlenType; + using aotriton::v3::flash::WindowValue; + aotriton::v3::flash::attn_fwd_params params; + params.Q = mk_aotensor(q_t, "q"); + params.K = mk_aotensor(k_t, "k"); + params.V = mk_aotensor(v_t, "v"); + params.Sm_scale = softmax_scale; + params.L = compute_logsumexp ? mk_aotensor<2>(softmax_lse, "M") : empty_t2; + params.Out = mk_aotensor(output_t, "Out"); + params.Max_seqlen_q = max_seqlen_q; // Unused if cu_seqlens_q is empty + params.Max_seqlen_k = max_seqlen_k; // Unused if cu_seqlens_k is empty + params.dropout_p = dropout_p; + params.philox_seed_ptr = seed; + params.philox_offset1 = offset1; + params.philox_offset2 = offset2; + params.philox_seed_output = seed_output; + params.philox_offset_output = offset_output; + params.encoded_softmax = mk_aotensor(softmax_fa_t, "encoded_softmax"); + params.persistent_atomic_counter = persistent_counter; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { + params.window_left = WindowValue::TopLeftAligned; + params.window_right = WindowValue::TopLeftAligned; + } else if (static_cast(sdp::CustomMaskType::CausalFromBottomRight) == custom_mask_type) { + params.window_left = WindowValue::BottomRightAligned; + params.window_right = WindowValue::BottomRightAligned; + } + if (bias.has_value()) { + params.B = mk_aotensor(bias.value(), "bias"); + } + if (seqstart_q.has_value()) { + params.varlen_type = VarlenType::CompactVarlen; + params.cu_seqlens_q = mk_aotensor<1>(seqstart_q.value(), "cu_seqlens_q"); + params.cu_seqlens_k = mk_aotensor<1>(seqstart_k.value(), "cu_seqlens_k"); + } else { + params.varlen_type = VarlenType::None; + } + err = aotriton::v3::flash::attn_fwd(params, + aotriton::v3::flash::attn_fwd_params::kVersion, + stream); +#endif // AOTRITON_V3_API + } else if (seqstart_q.has_value()) { // varlen aka nested tensor err = attn_fwd_compact_varlen(mk_aotensor(q_t, "q"), mk_aotensor(k_t, "k"), diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu index 3888df64ad80b..55fc1e261219e 100644 --- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu +++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu @@ -24,8 +24,11 @@ #include #include #else +#include #include #include +#include +#include #include #include #include @@ -45,6 +48,7 @@ #include #include #else +#include // MemoryEfficient Attention Specific Imports for ROCM #ifndef DISABLE_AOTRITON #include @@ -184,7 +188,7 @@ std::tuple _flash_attention_backward( return std::make_tuple(Tensor(), Tensor(), Tensor()); } -std::tuple _scaled_dot_product_cudnn_attention_backward_cuda( +std::tuple _cudnn_attention_backward( const Tensor& grad_out, const Tensor& query, const Tensor& key, @@ -211,57 +215,117 @@ std::tuple _scaled_dot_product_cudnn_attention_backward_ } } - const int64_t batch_size = query.size(0); - const int64_t num_heads = query.size(1); - const int64_t head_dim_qk = query.size(3); - const int64_t head_dim_v = value.size(3); + const bool is_nested = cum_seq_q.defined(); const int64_t max_seqlen_batch_q = query.size(2); const 
int64_t max_seqlen_batch_k = key.size(2); - // This is needed because SaveVariable automatically converts - // std::optional to undefined tensor - std::optional attn_bias_; - if (attn_bias.defined()) { - attn_bias_ = attn_bias; - } - if (attn_bias_.has_value()) { - const auto bias_dim = attn_bias_.value().dim(); - if (bias_dim == 2) { - attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k}); - } else if (bias_dim == 3) { - attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k}); - } else { - TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D"); - attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k}); + if (!is_nested) { + const int64_t batch_size = query.size(0); + const int64_t num_heads = query.size(1); + const int64_t head_dim_qk = query.size(3); + const int64_t head_dim_v = value.size(3); + + // This is needed because SaveVariable automatically converts + // std::optional to undefined tensor + std::optional attn_bias_; + if (attn_bias.defined()) { + attn_bias_ = attn_bias; + } + if (attn_bias_.has_value()) { + const auto bias_dim = attn_bias_.value().dim(); + if (bias_dim == 2) { + attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k}); + } else if (bias_dim == 3) { + attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k}); + } else { + TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D"); + attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k}); + } } - } - const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); - auto dq = at::empty_like(query); - auto dk = at::empty_like(key); - auto dv = at::empty_like(value); - run_cudnn_SDP_bprop(batch_size /*int64_t b*/, - num_heads /*int64_t h*/, - max_q/*int64_t s_q*/, - max_k/*int64_t s_kv*/, - head_dim_qk /*int64_t d_qk*/, - head_dim_v /*int64_t d_v*/, - softmax_scale /*float scaling_factor*/, - is_causal /*bool is_causal*/, - dropout_p /*float dropout_probability*/, - query /*const Tensor& q*/, - key /*const Tensor& k*/, - value /*const Tensor& v*/, - attn_bias_ /*const std::optional& attn_bias*/, - out /*const Tensor& o*/, - grad_out/*const Tensor& dO*/, - logsumexp.unsqueeze(-1)/*const Tensor& softmaxstats*/, - dq/*Tensor& dQ*/, - dk/*Tensor& dK*/, - dv/*Tensor& dV*/, - philox_seed/*Tensor& dropoutseed*/, - philox_offset/*Tensor& dropoutoffset*/); - return std::make_tuple(std::move(dq), std::move(dk), std::move(dv)); + const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); + auto dq = at::empty_like(query); + auto dk = at::empty_like(key); + auto dv = at::empty_like(value); + run_cudnn_SDP_bprop(batch_size /*int64_t b*/, + num_heads /*int64_t h*/, + max_q/*int64_t s_q*/, + max_k/*int64_t s_kv*/, + head_dim_qk /*int64_t d_qk*/, + head_dim_v /*int64_t d_v*/, + softmax_scale /*float scaling_factor*/, + is_causal /*bool is_causal*/, + dropout_p /*float dropout_probability*/, + query /*const Tensor& q*/, + key /*const Tensor& k*/, + value /*const Tensor& v*/, + attn_bias_ /*const std::optional& attn_bias*/, + out /*const Tensor& o*/, + grad_out/*const Tensor& dO*/, + logsumexp/*const Tensor& softmaxstats*/, + dq/*Tensor& dQ*/, + dk/*Tensor& dK*/, + dv/*Tensor& dV*/, + 
philox_seed/*Tensor& dropoutseed*/, + philox_offset/*Tensor& dropoutoffset*/); + return std::make_tuple(std::move(dq), std::move(dk), std::move(dv)); + } else { + // BHSD ... + const int64_t batch_size = cum_seq_q.size(0) - 1; + const int64_t num_heads_q = query.size(-2); + const int64_t num_heads_k = key.size(-2); + const int64_t num_heads_v = value.size(-2); + const int64_t head_dim_qk = query.size(-1); + const int64_t head_dim_v = value.size(-1); + std::optional attn_bias_; + if (attn_bias.defined()) { + attn_bias_ = attn_bias; + } + if (attn_bias_.has_value()) { + const auto bias_dim = attn_bias_.value().dim(); + if (bias_dim == 2) { + attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k}); + } else if (bias_dim == 3) { + attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k}); + } else { + attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k}); + TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D"); + } + } + + auto dq = at::empty_like(query); + auto dk = at::empty_like(key); + auto dv = at::empty_like(value); + + const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked(); + run_cudnn_SDP_bprop_nestedtensor( + batch_size, + num_heads_q, + num_heads_k, + num_heads_v, + max_seqlen_batch_q, + max_seqlen_batch_k, + head_dim_qk, + head_dim_v, + softmax_scale, + is_causal, + dropout_p, + cum_seq_q, + cum_seq_k, + query, + key, + value, + attn_bias_, + out, + grad_out, + logsumexp, + dq, + dk, + dv, + philox_seed, + philox_offset); + return std::make_tuple(std::move(dq), std::move(dk), std::move(dv)); + } } std::tuple @@ -431,7 +495,7 @@ _efficient_attention_backward( // ROCM Implementation if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) const auto my_softmax_scale = sdp::calculate_scale(query, scale).expect_float(); // Store grad_bias in optional std::optional opt_grad_bias = grad_bias; @@ -482,12 +546,15 @@ _efficient_attention_backward( } const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); bool is_causal; - if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { - is_causal = true; - } else if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { + if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { is_causal = false; } else { - TORCH_CHECK(false, "[_efficient_attention_backward] Unsupported mask type in AOTriton, for now"); + is_causal = true; +#if AOTRITON_V3_API == 0 + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) != custom_mask_type) { + TORCH_CHECK(false, "[_efficient_attention_forward] Unsupported mask type on ROCM, for now"); + } +#endif } at::Tensor q_t = query.permute({0,2,1,3}); at::Tensor k_t = key.permute({0,2,1,3}); @@ -506,7 +573,62 @@ _efficient_attention_backward( using sdp::aotriton_adapter::mk_aoscalartensor; using sdp::aotriton_adapter::cast_dtype; aotriton::TensorView<4> empty_t4(0, {0, 0, 0, 0}, {0, 0, 0, 0}, cast_dtype(query.dtype())); - if (cu_seqlens_q.has_value()) { + if constexpr (AOTRITON_ALWAYS_V3_API) { // Better readability than nesting ifdef +#if AOTRITON_V3_API // if constexpr does not stop errors from undefined functions + using aotriton::v3::flash::CausalType; + using aotriton::v3::flash::VarlenType; + using 
aotriton::v3::flash::WindowValue; + aotriton::v3::flash::attn_bwd_params params; + params.Q = mk_aotensor(q_t, "q"); + params.K = mk_aotensor(k_t, "k"); + params.V = mk_aotensor(v_t, "v"); + params.B = bias.has_value() ? mk_aotensor(bias.value(), "bias") : empty_t4; + params.Sm_scale = softmax_scale; + params.Out = mk_aotensor(out_t, "out"); + params.DO = mk_aotensor(dout_t, "dout"); + params.DK = mk_aotensor(dk_t, "dk"); + params.DV = mk_aotensor(dv_t, "dv"); + params.DQ = mk_aotensor(dq_t, "dq"); + params.DB = bias_requires_grad ? mk_aotensor(grad_bias, "db") : empty_t4; + params.L = mk_aotensor<2>(softmax_lse, "L"); + params.Max_seqlen_q = max_seqlen_q; // Unused if cu_seqlens_q is empty + params.Max_seqlen_k = max_seqlen_k; // Unused if cu_seqlens_k is empty + params.dropout_p = float(dropout_p); + params.philox_seed_ptr = mk_aoscalartensor(philox_seed); + params.philox_offset1 = mk_aoscalartensor(philox_offset); + params.philox_offset2 = 0; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { + params.window_left = WindowValue::TopLeftAligned; + params.window_right = WindowValue::TopLeftAligned; + } else if (static_cast(sdp::CustomMaskType::CausalFromBottomRight) == custom_mask_type) { + params.window_left = WindowValue::BottomRightAligned; + params.window_right = WindowValue::BottomRightAligned; + } +#if AOTRITON_ALWAYS_V3_API + using sdp::aotriton_adapter::mklazy_empty_like; + using sdp::aotriton_adapter::mklazy_fp32zeros; + using sdp::aotriton_adapter::LazyTensorContext; + LazyTensorContext lazy_delta { .like_tensor = softmax_lse, .tensor_name = "delta" }; + LazyTensorContext lazy_dq_acc { .like_tensor = dq_t, .tensor_name = "dq_acc" }; + params.D = mklazy_empty_like<2>(&lazy_delta); + params.DQ_ACC = mklazy_fp32zeros<4>(&lazy_dq_acc); +#else + at::Tensor delta = at::empty_like(softmax_lse).contiguous(); + params.D = mk_aotensor<2>(delta, "delta"); +#endif + if (cu_seqlens_q.has_value()) { + params.varlen_type = VarlenType::CompactVarlen; + params.cu_seqlens_q = mk_aotensor<1>(cu_seqlens_q.value(), "cu_seqlens_q"); + params.cu_seqlens_k = mk_aotensor<1>(cu_seqlens_k.value(), "cu_seqlens_k"); + } else { + params.varlen_type = VarlenType::None; + } + err = aotriton::v3::flash::attn_bwd(params, + aotriton::v3::flash::attn_bwd_params::kVersion, + stream); +#endif // AOTRITON_V3_API + } else if (cu_seqlens_q.has_value()) { at::Tensor delta = at::empty_like(softmax_lse).contiguous(); // varlen aka Nested tensor err = attn_bwd_compact_varlen(mk_aotensor(q_t, "q"), @@ -1063,4 +1185,40 @@ std::tuple _scaled_dot_product_e } } +std::tuple _scaled_dot_product_cudnn_attention_backward_cuda( + const Tensor& grad_out, + const Tensor& query, + const Tensor& key, + const Tensor& value, + const Tensor& out, + const Tensor& logsumexp, + const Tensor& philox_seed, + const Tensor& philox_offset, + const Tensor& attn_bias, + const Tensor& cum_seq_q, + const Tensor& cum_seq_k, + const int64_t max_q, + const int64_t max_k, + double dropout_p, + bool is_causal, + std::optional scale) { + return at::_cudnn_attention_backward( + grad_out, + query, + key, + value, + out, + logsumexp, + philox_seed, + philox_offset, + attn_bias, + cum_seq_q, + cum_seq_k, + max_q, + max_k, + dropout_p, + is_causal, + scale); +} + } // namespace at::native diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp index 4b198f4d6d2de..c826ef1ab8b15 100644 
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #if AT_CUDNN_ENABLED() #include @@ -25,9 +26,12 @@ #if USE_ROCM #if defined(USE_FLASH_ATTENTION) || defined(USE_MEM_EFF_ATTENTION) +#include #include #define USE_ROCM_ATTENTION 1 #endif +#else +#define USE_ROCM_ATTENTION 0 #endif // Avoid potential compiler -Wall -Werror complains undefined macro @@ -57,21 +61,29 @@ namespace sdp { namespace { +// tracks whether we've set the default priority order once, to avoid setting +// it redundantly or overwriting a user-specified priority order +// when the priority order context manager is used before the default priority +// order is initialized the following happens: +// (1) the current priority order is queried +// (2) priority_order() is called, which initializes it to the default as init_ is false +// (3) the user-specified priority order is set +// (3.1) we are in the priority context... +// (3.2) we exit the priority context... +// (4) the previous priority order (default) is restored +bool priority_order_init_ = false; + // TODO(eqy): more benchmarking to determine whether this should include sm86/89 // Needs to be kept in-sync with test_fused_chocie in test_transformers.py bool check_prefer_cudnn_attention() { - // TODO(eqy): Re-enable by default after upgrading to a release later than 9.5.0 - // see context: https://github.com/pytorch/pytorch/issues/138340 - // return false; -#if defined(CUDNN_VERSION) - -#if CUDNN_VERSION > 90000 + static const bool prefer_cudnn = c10::utils::check_env("TORCH_CUDNN_SDPA_PREFERRED") != false; + if (!prefer_cudnn) { + return false; + } +#if (defined(CUDNN_VERSION) && (CUDNN_VERSION >= 90900)) auto dprops = at::cuda::getCurrentDeviceProperties(); - return dprops->major >= 9; -#else - return false; -#endif - + auto major = dprops->major; + return (major == 9 || major == 10) && !dprops->minor; #else return false; #endif @@ -79,6 +91,16 @@ bool check_prefer_cudnn_attention() { // flash_attention V2 is universally faster than efficient_attention and Math std::array priority_order(sdp_params const& params) { + if (!priority_order_init_) { + priority_order_init_ = true; + if (check_prefer_cudnn_attention()) { + const std::vector cudnn_order = {static_cast(at::SDPBackend::cudnn_attention), + static_cast(at::SDPBackend::flash_attention), + static_cast(at::SDPBackend::efficient_attention), + static_cast(at::SDPBackend::math)}; + at::globalContext().setSDPPriorityOrder(cudnn_order); + } + } return at::globalContext().sDPPriorityOrder(); } @@ -112,9 +134,24 @@ int64_t minimum_gemm_alignment(sdp_params const& params) { // caller_is_meff is added to make the TORCH_WARN message showing the correct result template bool check_head_dim_size_flash(sdp_params const& params, bool debug) { -#if USE_ROCM_ATTENTION && AOTRITON_VERSION_MINOR >= 9 +#if USE_ROCM_ATTENTION // AOTriton 0.9+ supports head_dim up to 512 - const auto max_size = c10::SymInt(512); + const static auto max_hdim = []() { +#if AOTRITON_VERSION_CURRENT == AOTRITON_VERSION_INT(0, 11) + // gfx11xx only support hdim <= 256 on AOTriton 0.11 + auto dprops = at::cuda::getCurrentDeviceProperties(); + const c10::basic_string_view arch(dprops->gcnArchName); + if (arch.starts_with("gfx11")) { + return 256; + } +#endif // AOTriton 0.11 +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 9) + return 512; +#else + return 256; +#endif + }(); + const auto max_size = c10::SymInt(max_hdim); #else // 
All head_dim sizes must be equal and less than 256 const auto max_size = c10::SymInt(256); @@ -414,9 +451,9 @@ bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) { return false; } auto head_dim_limit = 128; - if (cudnn_version >= 90501) { + if (cudnn_version >= 91000) { auto dprops = at::cuda::getCurrentDeviceProperties(); - if (dprops->major == 9 && !dprops->minor) { + if (dprops->major == 9 && !dprops->minor) { head_dim_limit = 256; } } @@ -453,9 +490,15 @@ bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) { return false; } } - if (s_q == 1 || s_k == 1) { + if (s_k == 1) { + if (debug) { + TORCH_WARN_ONCE("cudnn SDPA does not support key/value sequence length 1."); + } + return false; + } + if (s_q == 1 && params.dropout != 0.0) { if (debug) { - TORCH_WARN_ONCE("cudnn SDPA does not support sequence length 1."); + TORCH_WARN_ONCE("cudnn SDPA does not support query sequence length 1 with dropout."); } return false; } @@ -563,9 +606,9 @@ bool check_for_nested_inputs(sdp_params const& params, bool debug) { const auto dprop = at::cuda::getCurrentDeviceProperties(); // Check that the input is nested - if (dprop->major != 9 && has_for_nested_inputs(params)) { + if (!(dprop->major == 9 || dprop->major == 10) && has_for_nested_inputs(params)) { if (debug) { - TORCH_WARN("CuDNN SDPA supports nested tensors on SM 9.0."); + TORCH_WARN("cuDNN SDPA supports nested tensors on SM 9.0, SM 10.0."); } return false; } @@ -589,7 +632,7 @@ bool check_runtime_disabled_cudnn(sdp_params const& params, bool debug) { // sdp kernels if (!at::globalContext().userEnabledCuDNNSDP()) { if (debug) { - TORCH_WARN("CuDNN attention has been runtime disabled."); + TORCH_WARN("cuDNN attention has been runtime disabled."); } return false; } @@ -620,7 +663,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) { #endif #if defined(CUDNN_VERSION) && CUDNN_VERSION < 90000 if (debug) { - TORCH_WARN(CUDNN_VERSION, " cuDNN version too old to use CuDNN Attention (< v9.0.0)"); + TORCH_WARN(CUDNN_VERSION, " cuDNN version too old to use cuDNN Attention (< v9.0.0)"); } return false; #endif @@ -630,10 +673,8 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) { c10::array_of( check_runtime_disabled_cudnn, check_for_nested_inputs, - check_nonzero_sequence_lengths_dense, check_all_tensors_on_device, check_tensor_shapes, - check_cudnn_tensor_shapes, check_cudnn_deterministic, check_dtypes_low_precision, check_attn_mask_shape, @@ -646,8 +687,10 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) { } constexpr auto dense_constraints = c10::array_of( + check_nonzero_sequence_lengths_dense, check_last_dim_stride_equals_1_dense, - check_batch_size_and_num_heads_dense + check_batch_size_and_num_heads_dense, + check_cudnn_tensor_shapes ); if (has_only_dense_inputs(params)) { @@ -864,7 +907,7 @@ SDPBackend select_sdp_backend(sdp_params const& kernel_params) { sdp::can_use_mem_efficient_attention(kernel_params, print_debug); TORCH_WARN("Flash attention kernel not used because:"); sdp::can_use_flash_attention(kernel_params, print_debug); - TORCH_WARN("CuDNN attention kernel not used because:"); + TORCH_WARN("cuDNN attention kernel not used because:"); sdp::can_use_cudnn_attention(kernel_params, print_debug); TORCH_CHECK(!print_debug, "No available kernel. 
Aborting execution.") return SDPBackend::error; diff --git a/aten/src/ATen/native/transformers/hip/aotriton_adapter.h b/aten/src/ATen/native/transformers/hip/aotriton_adapter.h index aedb205e57101..d316808cf9bef 100644 --- a/aten/src/ATen/native/transformers/hip/aotriton_adapter.h +++ b/aten/src/ATen/native/transformers/hip/aotriton_adapter.h @@ -2,8 +2,12 @@ #ifdef USE_ROCM +// Expect to be included after headers of at::zeros_like and at::empty_like + #include #include +#include +#include //////////////////////////////////////////////////////////////////////////////// // Common macros copied from cuda/mem_eff_attention/gemm_kernel_utils.h @@ -111,6 +115,61 @@ inline aotriton::TensorView<0> mk_atomictensor(const int32_t* ptr) aotriton::DType::kInt32); } +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 11) + +struct LazyTensorContext { + at::Tensor like_tensor; + std::string_view tensor_name; + at::Tensor tensor; +}; + +template +struct LazyTensorFunctions : public LazyTensorContext { + static aotriton::TensorView acquire(void* cookie) { + auto ctx = (LazyTensorContext*)cookie; + if (!ctx->tensor.defined()) { + auto q = ctx->like_tensor; + if constexpr (kRequireZeros) { + ctx->tensor = at::zeros(q.sizes(), + q.options().dtype(at::kFloat)); + } else { + ctx->tensor = at::empty_like(q); + } + } + return mk_aotensor(ctx->tensor, ctx->tensor_name); + } + + static void dispose(void* cookie) { + } +}; + +template +aotriton::LazyTensor mklazy_common(LazyTensorContext* cookie) +{ + using LTF = LazyTensorFunctions; + return aotriton::LazyTensor { + .cookie = cookie, + .acquire = <F::acquire, + .dispose = <F::dispose + }; +} + +template +auto mklazy_empty_like(LazyTensorContext* cookie) +{ + return mklazy_common(cookie); +} + + +// Note: this will not keep the original strides +template +auto mklazy_fp32zeros(LazyTensorContext* cookie) +{ + return mklazy_common(cookie); +} + +#endif // >= 0.11 + } // namespace aotriton_adapter } // namespace sdp diff --git a/aten/src/ATen/native/transformers/hip/aotriton_versions.h b/aten/src/ATen/native/transformers/hip/aotriton_versions.h new file mode 100644 index 0000000000000..2f5d3f0e12228 --- /dev/null +++ b/aten/src/ATen/native/transformers/hip/aotriton_versions.h @@ -0,0 +1,20 @@ +#pragma once + +#ifdef USE_ROCM + +#define AOTRITON_VERSION_INT(x, y) (x * 100 + y) +#define AOTRITON_VERSION_CURRENT (AOTRITON_VERSION_MAJOR * 100 + AOTRITON_VERSION_MINOR) + +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 11) +#define AOTRITON_ALWAYS_V3_API 1 +#else +#define AOTRITON_ALWAYS_V3_API 0 +#endif + +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 10) +#define AOTRITON_V3_API 1 +#else +#define AOTRITON_V3_API 0 +#endif + +#endif diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip b/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip index 05523f75caa42..b5b1ed4292896 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip @@ -60,20 +60,13 @@ #include // AOTriton headers -#include #include #include -#if AOTRITON_VERSION_MINOR < 9 +#if AOTRITON_VERSION_CURRENT < AOTRITON_VERSION_INT(0, 9) #error "This adaptor code is only tested with AOTriton >= 0.9" #endif -#if (AOTRITON_VERSION_MAJOR * 100 + AOTRITON_VERSION_MINOR) >= 10 -#define V3_API 1 -#else -#define V3_API 0 -#endif - namespace pytorch_flash { namespace { @@ -93,15 +86,15 @@ calculate_swa(std::optional window_size_left, int max_seqlen_q, int 
max_seqlen_k, bool is_causal) { -#if V3_API // SWA is exposed through V3 API +#if AOTRITON_V3_API // SWA is exposed through V3 API bool needs_swa = false; using aotriton::v3::flash::WindowValue; // Default values when std::optional window_size_left/right have no value int window_left = max_seqlen_q; int window_right = max_seqlen_k; if (is_causal) { - window_left = WindowValue::TopLeftAligned; - window_right = WindowValue::TopLeftAligned; + window_left = WindowValue::BottomRightAligned; + window_right = WindowValue::BottomRightAligned; } if (window_size_left.has_value() || window_size_right.has_value()) { needs_swa = true; @@ -243,25 +236,27 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x } else { softmax_fa_t = at::empty({ 0, 0, 0, 0 }, opts); } - - at::Tensor atomic_counter; - if (is_causal) { - atomic_counter = at::zeros({1}, opts.dtype(at::kInt)); - } - auto [needs_swa, window_left, window_right] = calculate_swa(window_size_left, window_size_right, seqlen_q, seqlen_k, is_causal); -#if V3_API +#if AOTRITON_V3_API const bool uses_swa = needs_swa; #else - // When V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be + // When AOTRITON_V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be // optimized out (hopefully). constexpr bool uses_swa = false; #endif + // SWA in AOTriton Kernels is treated as "Generalized Causal masks" + is_causal = is_causal || uses_swa; + + at::Tensor atomic_counter; + if (is_causal) { + atomic_counter = at::zeros({1}, opts.dtype(at::kInt)); + } + hipError_t err; // TODO: Error handling using aotriton::v2::flash::attn_fwd; using sdp::aotriton_adapter::mk_aotensor; @@ -276,8 +271,8 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x auto seed_output = mk_philoxtensor(use_philox_state ? seed_t.data_ptr() : nullptr); auto offset_output = mk_philoxtensor(use_philox_state ? offset_t.data_ptr() : nullptr); auto persistent_counter = mk_atomictensor(is_causal ? atomic_counter.data_ptr() : nullptr); - if (uses_swa) { -#if V3_API + if (uses_swa || AOTRITON_ALWAYS_V3_API) { +#if AOTRITON_V3_API using aotriton::v3::flash::CausalType; using aotriton::v3::flash::VarlenType; aotriton::v3::flash::attn_fwd_params params; @@ -297,7 +292,7 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x params.philox_offset_output = offset_output; params.encoded_softmax = mk_aotensor(softmax_fa_t, "encoded_softmax"); params.persistent_atomic_counter = persistent_counter; - params.causal_type = CausalType::WindowedAttention; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; params.varlen_type = VarlenType::None; params.window_left = window_left; params.window_right = window_right; @@ -447,14 +442,17 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot max_seqlen_q, max_seqlen_k, is_causal); -#if V3_API +#if AOTRITON_V3_API const bool uses_swa = needs_swa; #else - // When V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be + // When AOTRITON_V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be // optimized out (hopefully). 
constexpr bool uses_swa = false; #endif + // SWA in AOTriton Kernels is treated as "Generalized Causal masks" + is_causal = is_causal || needs_swa; + auto [seed_t, offset_t, philox_state, use_philox_state] = prepare_philox_arguments(p_dropout, batch_size * num_heads * 32); @@ -477,8 +475,8 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot auto seed_output = use_philox_state ? mk_philoxtensor(seed_t.data_ptr()) : nullscalar; auto offset_output = use_philox_state ? mk_philoxtensor(offset_t.data_ptr()) : nullscalar; auto persistent_counter = is_causal ? mk_philoxtensor(atomic_counter.data_ptr()) : nullscalar; - if (uses_swa) { -#if V3_API + if (uses_swa || AOTRITON_ALWAYS_V3_API) { +#if AOTRITON_V3_API using aotriton::v3::flash::CausalType; using aotriton::v3::flash::VarlenType; aotriton::v3::flash::attn_fwd_params params; @@ -500,7 +498,7 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot params.philox_offset_output = offset_output; params.encoded_softmax = mk_aotensor(softmax_fa_t, "encoded_softmax"); params.persistent_atomic_counter = persistent_counter; - params.causal_type = CausalType::WindowedAttention; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; params.varlen_type = VarlenType::CompactVarlen; params.window_left = window_left; params.window_right = window_right; @@ -594,10 +592,6 @@ mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x hea const int seqlen_k = k.size(1); const int num_heads_k = k.size(2); - if (is_causal){ - TORCH_CHECK((seqlen_q == seqlen_k), "For backwards kernel seqlen_q must equal seqlen_k for causal kernels"); - } - TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8"); TORCH_CHECK(head_size_og % 8 == 0, "head_size_og should be a multiple of 8, this is ensured by padding!"); @@ -649,10 +643,10 @@ mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x hea seqlen_q, seqlen_k, is_causal); -#if V3_API +#if AOTRITON_V3_API const bool uses_swa = needs_swa; #else - // When V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be + // When AOTRITON_V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be // optimized out (hopefully). 
constexpr bool uses_swa = false; #endif @@ -676,10 +670,9 @@ mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x hea hipError_t err; // TODO: Error handling using sdp::aotriton_adapter::mk_aotensor; using sdp::aotriton_adapter::mk_aoscalartensor; - if (uses_swa) { -#if V3_API + if (uses_swa || AOTRITON_ALWAYS_V3_API) { +#if AOTRITON_V3_API // Fused BWD does not support SWA - at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); using aotriton::v3::flash::CausalType; using aotriton::v3::flash::VarlenType; aotriton::v3::flash::attn_bwd_params params; @@ -689,21 +682,32 @@ mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x hea params.Sm_scale = softmax_scale; params.Out = mk_aotensor(out_t, "out"); params.DO = mk_aotensor(dout_t, "dout"); - params.DK = mk_aotensor(dq_t, "dq"); - params.DV = mk_aotensor(dk_t, "dk"); - params.DQ = mk_aotensor(dv_t, "dv"); + params.DQ = mk_aotensor(dq_t, "dq"); + params.DK = mk_aotensor(dk_t, "dk"); + params.DV = mk_aotensor(dv_t, "dv"); params.L = mk_aotensor<2>(softmax_lse_cont, "L"); - params.D = mk_aotensor<2>(delta, "delta"); params.Max_seqlen_q = seqlen_q; // Unused if cu_seqlens_q is empty params.Max_seqlen_k = seqlen_k; // Unused if cu_seqlens_k is empty params.dropout_p = p_dropout; params.philox_seed_ptr = mk_aoscalartensor(philox_seed); params.philox_offset1 = mk_aoscalartensor(philox_offset); params.philox_offset2 = 0; - params.causal_type = CausalType::WindowedAttention; - params.varlen_type = VarlenType::None; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; params.window_left = window_left; params.window_right = window_right; + params.varlen_type = VarlenType::None; +#if AOTRITON_ALWAYS_V3_API + using sdp::aotriton_adapter::mklazy_empty_like; + using sdp::aotriton_adapter::mklazy_fp32zeros; + using sdp::aotriton_adapter::LazyTensorContext; + LazyTensorContext lazy_delta { .like_tensor = softmax_lse_cont, .tensor_name = "delta" }; + LazyTensorContext lazy_dq_acc { .like_tensor = dq_t, .tensor_name = "dq_acc" }; + params.D = mklazy_empty_like<2>(&lazy_delta); + params.DQ_ACC = mklazy_fp32zeros<4>(&lazy_dq_acc); +#else + at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); + params.D = mk_aotensor<2>(delta, "delta"); +#endif err = aotriton::v3::flash::attn_bwd(params, aotriton::v3::flash::attn_bwd_params::kVersion, stream); @@ -838,7 +842,6 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size CHECK_SHAPE(cu_seqlens_k, batch_size + 1); at::Tensor softmax_lse_cont = softmax_lse.view({batch_size * num_heads, max_seqlen_q}).contiguous(); - at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); at::Tensor q_padded, k_padded, v_padded; q_padded = q.unsqueeze(0).transpose(1, 2); @@ -896,10 +899,10 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size max_seqlen_q, max_seqlen_k, is_causal); -#if V3_API +#if AOTRITON_V3_API const bool uses_swa = needs_swa; #else - // When V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be + // When AOTRITON_V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be // optimized out (hopefully). 
constexpr bool uses_swa = false; #endif @@ -919,8 +922,8 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size hipError_t err; // TODO: Error handling using sdp::aotriton_adapter::mk_aotensor; using sdp::aotriton_adapter::mk_aoscalartensor; - if (uses_swa) { -#if V3_API + if (uses_swa || AOTRITON_ALWAYS_V3_API) { +#if AOTRITON_V3_API using aotriton::v3::flash::CausalType; using aotriton::v3::flash::VarlenType; aotriton::v3::flash::attn_bwd_params params; @@ -930,11 +933,10 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size params.Sm_scale = softmax_scale; params.Out = mk_aotensor(out_t, "out"); params.DO = mk_aotensor(dout_t, "dout"); - params.DK = mk_aotensor(dq_padded, "dq"); - params.DV = mk_aotensor(dk_padded, "dk"); - params.DQ = mk_aotensor(dv_padded, "dv"); + params.DK = mk_aotensor(dk_padded, "dk"); + params.DV = mk_aotensor(dv_padded, "dv"); + params.DQ = mk_aotensor(dq_padded, "dq"); params.L = mk_aotensor<2>(softmax_lse_cont, "L"); - params.D = mk_aotensor<2>(delta, "delta"); params.cu_seqlens_q = mk_aotensor<1>(cu_seqlens_q, "cu_seqlens_q"); params.cu_seqlens_k = mk_aotensor<1>(cu_seqlens_k, "cu_seqlens_k"); params.Max_seqlen_q = max_seqlen_q; // Unused if cu_seqlens_q is empty @@ -943,17 +945,30 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size params.philox_seed_ptr = mk_aoscalartensor(philox_seed); params.philox_offset1 = mk_aoscalartensor(philox_offset); params.philox_offset2 = 0; - params.causal_type = CausalType::WindowedAttention; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; params.varlen_type = VarlenType::CompactVarlen; params.window_left = window_left; params.window_right = window_right; +#if AOTRITON_ALWAYS_V3_API + using sdp::aotriton_adapter::mklazy_empty_like; + using sdp::aotriton_adapter::mklazy_fp32zeros; + using sdp::aotriton_adapter::LazyTensorContext; + LazyTensorContext lazy_delta { .like_tensor = softmax_lse_cont, .tensor_name = "delta" }; + LazyTensorContext lazy_dq_acc { .like_tensor = dq_padded, .tensor_name = "dq_acc" }; + params.D = mklazy_empty_like<2>(&lazy_delta); + params.DQ_ACC = mklazy_fp32zeros<4>(&lazy_dq_acc); +#else + at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); + params.D = mk_aotensor<2>(delta, "delta"); +#endif err = aotriton::v3::flash::attn_bwd(params, aotriton::v3::flash::attn_bwd_params::kVersion, stream); -#endif +#endif // AOTRITON_ALWAYS_V3_API } else { using aotriton::v2::flash::attn_bwd_compact_varlen; using sdp::aotriton_adapter::cast_dtype; + at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); aotriton::TensorView<4> empty_bias(0, {0,0,0,0}, {0,0,0,0}, cast_dtype(q.dtype())); err = attn_bwd_compact_varlen(mk_aotensor(q_padded, "q"), mk_aotensor(k_padded, "k"), diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip index 601ffd2d07525..59669afb93d2f 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip @@ -1,7 +1,7 @@ #include #include -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) namespace pytorch_flash { std::tuple< at::Tensor, // dQ @@ -117,4 +117,4 @@ mem_eff_backward_ck( } } // namespace pytorch_flash -#endif // USE_CK_FLASH_ATTENTION +#endif // USE_ROCM_CK_SDPA diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h index 6fd46467bc076..e92006ef6315c 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h @@ -3,7 +3,7 @@ #include -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) namespace pytorch_flash { std::tuple< @@ -64,4 +64,4 @@ mem_eff_backward_ck( const at::Tensor philox_offset); } // namespace pytorch_flash -#endif // USE_CK_FLASH_ATTENTION +#endif // USE_ROCM_CK_SDPA diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip index fac77821a56c1..d15c5105d0b46 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip @@ -1,7 +1,7 @@ #include #include -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) namespace pytorch_flash { std::tuple< at::Tensor, // output @@ -93,4 +93,4 @@ mem_eff_forward_ck( } } // namespace pytorch_flash -#endif // USE_CK_FLASH_ATTENTION +#endif // USE_ROCM_CK_SDPA diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h index 17298aae9485d..f6f2240d4f091 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h @@ -147,7 +147,7 @@ std::tuple mha_varlen_bwd_aot( const at::Tensor& philox_seed, const at::Tensor& philox_offset); -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) // CK implementation TORCH_API std::tuple< @@ -295,7 +295,7 @@ mha_fwd( const float softcap, const bool return_softmax, std::optional gen_) { -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) if (at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { const int non_null_window_left = window_size_left.value_or(-1); @@ -368,7 +368,7 @@ mha_varlen_fwd( const float softcap, const bool return_softmax, std::optional gen_) { -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) if (at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { std::optional dummy_attn_bias = std::nullopt; @@ -441,9 +441,10 @@ inline std::tuple mha_bwd( const bool deterministic, const at::Tensor philox_seed, const at::Tensor philox_offset) { + +#if defined(USE_ROCM_CK_SDPA) if (at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { -#if defined(USE_CK_FLASH_ATTENTION) std::optional non_null_dbias = std::nullopt; const int non_null_window_left = window_size_left.value_or(-1); const int non_null_window_right = window_size_right.value_or(-1); @@ -474,10 +475,8 @@ inline std::tuple mha_bwd( philox_offset); // for FA return [dQ, dV, dK, dSoftmax] return std::make_tuple(std::move(dQuery), std::move(dKey), std::move(dValue), std::move(dSoftmax)); -#else - TORCH_WARN_ONCE("Warning! You have opted to use CK flash attention backend in a build that was not compiled using USE_CK_FLASH_ATTENTION=1. Please set this variable and try again. 
Defaulting to use aotriton backend..."); -#endif } +#endif return mha_bwd_aot( dout, q, @@ -530,7 +529,7 @@ inline std::tuple mha_varlen_bwd const bool deterministic, const at::Tensor philox_seed, const at::Tensor philox_offset) { -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) if (at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { std::optional non_null_dbias = std::nullopt; diff --git a/aten/src/ATen/native/transformers/hip/gemm_kernel_utils.h b/aten/src/ATen/native/transformers/hip/gemm_kernel_utils.h new file mode 100644 index 0000000000000..c18744afc1ffc --- /dev/null +++ b/aten/src/ATen/native/transformers/hip/gemm_kernel_utils.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// This file is a trimmed version of cuda/mem_eff_attention/gemm_kernel_utils.h +#pragma once + +#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK(TENSOR.is_contiguous()); + +#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK( \ + TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous"); + +#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \ + TORCH_CHECK( \ + uint64_t(PTR) % ALIGNMENT == 0, #PTR " is not correctly aligned") + +#define ASSIGN_CHECK_OVERFLOW(A, B) \ + { \ + A = B; \ + TORCH_CHECK( \ + B < std::numeric_limits::max(), #B " overflows"); \ + } diff --git a/aten/src/ATen/test/cuda_vectorized_test.cu b/aten/src/ATen/test/cuda_vectorized_test.cu index 236753c94d37b..1b3ed4dc4ac42 100644 --- a/aten/src/ATen/test/cuda_vectorized_test.cu +++ b/aten/src/ATen/test/cuda_vectorized_test.cu @@ -10,8 +10,13 @@ using namespace at::native::memory; constexpr int buffer_size = 1024; +#if defined(CUDA_VERSION) && CUDA_VERSION < 13000 __managed__ double4 buffer1[buffer_size]; __managed__ double4 buffer2[buffer_size]; +#else +__managed__ double4_16a buffer1[buffer_size]; +__managed__ double4_16a buffer2[buffer_size]; +#endif void reset_buffers() { for (int i = 0; i < buffer_size; i++) { diff --git a/aten/src/ATen/test/thread_init_test.cpp b/aten/src/ATen/test/thread_init_test.cpp index 7ad7a18e9c660..60dd52d1dffcb 100644 --- a/aten/src/ATen/test/thread_init_test.cpp +++ b/aten/src/ATen/test/thread_init_test.cpp @@ -1,7 +1,8 @@ +#include + #include #include #include -#include #include @@ -9,7 +10,7 @@ // numbers of threads set and also whether the scheduler // will throw an exception when multiple threads call // their first parallel construct. 
-void test(int given_num_threads) { +static void test(int given_num_threads) { auto t = at::ones({1000 * 1000}, at::CPU(at::kFloat)); ASSERT_TRUE(given_num_threads >= 0); ASSERT_EQ(at::get_num_threads(), given_num_threads); @@ -19,7 +20,7 @@ void test(int given_num_threads) { } } -int main() { +TEST(ThreadInitTest, ThreadInit) { at::init_num_threads(); at::set_num_threads(4); @@ -32,13 +33,11 @@ int main() { #if !AT_PARALLEL_NATIVE at::set_num_threads(5); - ASSERT_TRUE(at::get_num_threads() == 5); + ASSERT_EQ(at::get_num_threads(), 5); #endif // test inter-op settings at::set_num_interop_threads(5); ASSERT_EQ(at::get_num_interop_threads(), 5); ASSERT_ANY_THROW(at::set_num_interop_threads(6)); - - return 0; } diff --git a/aten/src/ATen/xpu/CachingHostAllocator.cpp b/aten/src/ATen/xpu/CachingHostAllocator.cpp index 1255285d25af0..d531b46c3c554 100644 --- a/aten/src/ATen/xpu/CachingHostAllocator.cpp +++ b/aten/src/ATen/xpu/CachingHostAllocator.cpp @@ -30,6 +30,12 @@ struct XPUCachingHostAllocatorImpl bool query_event(XPUEvent& event) override { return event.query(); } + + bool pinned_use_background_threads() override { + // Using background threads for XPU causes a hang on Windows during program + // exit. Will be enabled once the issue is resolved. + return false; + } }; DECLARE_HOST_ALLOCATOR( diff --git a/benchmarks/data/dataloader_benchmark.py b/benchmarks/data/dataloader_benchmark.py new file mode 100644 index 0000000000000..7d1dd3afc7e98 --- /dev/null +++ b/benchmarks/data/dataloader_benchmark.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 +""" +Benchmark script for PyTorch DataLoader with different worker methods. + +This script measures: +1. Dataloader initialization time +2. Dataloading speed (time per batch) +3. CPU memory utilization + +Usage: + python dataloader_benchmark.py --data_path /path/to/dataset --batch_size 32 --num_workers 4 +""" + +import argparse +import copy +import gc +import time + +import psutil +import torchvision +import torchvision.transforms as transforms +from torchvision.models import resnet18 + +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import DataLoader +from torch.utils.data.dataset import ConcatDataset + + +def get_memory_usage(): + """ + Get current memory usage in MB. This includes all child processes. 
+ + Returns: + Total memory usage in MB + """ + process = psutil.Process() + + main_memory = process.memory_full_info().pss + + # Add memory usage of all child processes + for child in process.children(recursive=True): + try: + child_mem = child.memory_full_info().pss + main_memory += child_mem + except (psutil.NoSuchProcess, psutil.AccessDenied, AttributeError): + # Process might have terminated or doesn't support PSS, fall back to USS + print(f"Failed to get PSS for {child}, falling back to USS") + child_mem = child.memory_info().uss + main_memory += child_mem + + return main_memory / (1024 * 1024) + + +def print_detailed_memory(): + """Print detailed memory information.""" + process = psutil.Process() + print("\nDetailed memory information:") + try: + print( + f" USS (Unique Set Size): {process.memory_full_info().uss / (1024 * 1024):.2f} MB" + ) + print( + f" PSS (Proportional Set Size): {process.memory_full_info().pss / (1024 * 1024):.2f} MB" + ) + print( + f" RSS (Resident Set Size): {process.memory_info().rss / (1024 * 1024):.2f} MB" + ) + except Exception: + print(" Detailed memory info not available") + + +def create_model(): + """Create a simple model for benchmarking.""" + model = resnet18() + return model + + +def benchmark_dataloader( + dataset, + batch_size, + num_workers, + num_epochs=1, + max_batches=10, + multiprocessing_context=None, + logging_freq=10, +): + """Benchmark a dataloader with specific configuration.""" + print("\n--- Benchmarking DataLoader ---") + + # Clear memory before starting + gc.collect() + torch.cuda.empty_cache() + + # Create model + model = create_model() + + # Measure memory before dataloader creation + memory_before = get_memory_usage() + print(f"Memory before DataLoader creation: {memory_before:.2f} MB") + print_detailed_memory() + + # Measure dataloader initialization time + start = time.perf_counter() + dataloader = DataLoader( + dataset, + batch_size=batch_size, + shuffle=True, + num_workers=num_workers, + pin_memory=torch.cuda.is_available(), + prefetch_factor=2 if num_workers > 0 else None, + multiprocessing_context=multiprocessing_context, + ) + it = iter(dataloader) + dataloader_init_time = time.perf_counter() - start + + # Measure memory after dataloader creation + memory_after = get_memory_usage() + print(f"Memory after DataLoader creation: {memory_after:.2f} MB") + print(f"Memory increase: {memory_after - memory_before:.2f} MB") + + # Create model and optimizer + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = model.to(device) + criterion = nn.CrossEntropyLoss() + optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9) + + # Benchmark dataloading speed + model.train() + total_batches = 0 + total_samples = 0 + total_time = 0 + total_data_load_time = 0 + + # Measure peak memory during training + peak_memory = memory_after + + print( + f"\nStarting training loop with {num_epochs} epochs (max {max_batches} batches per epoch)" + ) + + for epoch in range(num_epochs): + while total_batches < max_batches: + batch_start = time.perf_counter() + + try: + inputs, labels = next(it) + except StopIteration: + break + + # Move data to device + inputs = inputs.to(device) + labels = labels.to(device) + + # Capture data fetch time (including sending to device) + data_load_time = time.perf_counter() - batch_start + + # Forward pass + outputs = model(inputs) + loss = criterion(outputs, labels) + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Capture batch time + batch_time 
= time.perf_counter() - batch_start + + total_batches += 1 + total_samples += inputs.size(0) + total_data_load_time += data_load_time + total_time += batch_time + + # Update peak memory and log memory usage periodically + if total_batches % 5 == 0: + # Force garbage collection before measuring memory + gc.collect() + current_memory = get_memory_usage() + + if current_memory > peak_memory: + peak_memory = current_memory + + if total_batches % logging_freq == 0: + print( + f"Epoch {epoch + 1}, Batch {total_batches}, " + f"Time: {batch_time:.4f}s, " + f"Memory: {current_memory:.2f} MB" + ) + + # Calculate statistics + avg_data_load_time = ( + total_data_load_time / total_batches if total_batches > 0 else 0 + ) + avg_batch_time = total_time / total_batches if total_batches > 0 else 0 + samples_per_second = total_samples / total_time if total_time > 0 else 0 + + results = { + "dataloader_init_time": dataloader_init_time, + "num_workers": num_workers, + "batch_size": batch_size, + "total_batches": total_batches, + "avg_batch_time": avg_batch_time, + "avg_data_load_time": avg_data_load_time, + "samples_per_second": samples_per_second, + "peak_memory_mb": peak_memory, + "memory_increase_mb": peak_memory - memory_before, + } + + print("\nResults:") + print(f" DataLoader init time: {dataloader_init_time:.4f} seconds") + print(f" Average data loading time: {avg_data_load_time:.4f} seconds") + print(f" Average batch time: {avg_batch_time:.4f} seconds") + print(f" Samples per second: {samples_per_second:.2f}") + print(f" Peak memory usage: {peak_memory:.2f} MB") + print(f" Memory increase: {peak_memory - memory_before:.2f} MB") + + # Clean up + del model, optimizer + del dataloader + + # Force garbage collection + gc.collect() + torch.cuda.empty_cache() + + return results + + +def main(): + parser = argparse.ArgumentParser( + description="Benchmark PyTorch DataLoader with different worker methods" + ) + parser.add_argument("--data_path", required=True, help="Path to dataset") + parser.add_argument("--batch_size", type=int, default=32, help="Batch size") + parser.add_argument("--num_workers", type=int, default=4, help="Number of workers") + parser.add_argument( + "--max_batches", + type=int, + default=100, + help="Maximum number of batches per epoch", + ) + parser.add_argument("--num_epochs", type=int, default=1, help="Number of epochs") + parser.add_argument( + "--multiprocessing_context", + choices=["fork", "spawn", "forkserver"], + default="forkserver", + help="Multiprocessing context to use (fork, spawn, forkserver)", + ) + parser.add_argument( + "--dataset_copies", + type=int, + default=1, + help="Number of copies of the dataset to concatenate (for testing memory usage)", + ) + parser.add_argument( + "--logging_freq", + type=int, + default=10, + help="Frequency of logging memory usage during training", + ) + args = parser.parse_args() + + # Print system info + print("System Information:") + # The following are handy for debugging if building from source worked correctly + print(f" PyTorch version: {torch.__version__}") + print(f" PyTorch location: {torch.__file__}") + print(f" Torchvision version: {torchvision.__version__}") + print(f" Torchvision location: {torchvision.__file__}") + print(f" CUDA available: {torch.cuda.is_available()}") + if torch.cuda.is_available(): + print(f" CUDA device: {torch.cuda.get_device_name(0)}") + print(f" CPU count: {psutil.cpu_count(logical=True)}") + print(f" Physical CPU cores: {psutil.cpu_count(logical=False)}") + print(f" Total system memory: 
{psutil.virtual_memory().total / (1024**3):.2f} GB") + + # Define transforms + transform = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + + # Load dataset + print(f"\nLoading dataset from {args.data_path} ({args.dataset_copies} copies)") + + # Try to load as ImageFolder + datasets = [] + for _ in range(args.dataset_copies): + base_dataset = torchvision.datasets.ImageFolder( + args.data_path, transform=transform + ) + datasets.append(copy.deepcopy(base_dataset)) + del base_dataset + dataset = ConcatDataset(datasets) + + print(f"Dataset size: {len(dataset)}") + + # Run benchmark with specified worker method + benchmark_dataloader( + dataset, + batch_size=args.batch_size, + num_workers=args.num_workers, + multiprocessing_context=args.multiprocessing_context, + num_epochs=args.num_epochs, + max_batches=args.max_batches, + logging_freq=args.logging_freq, + ) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/dynamo/check_accuracy.py b/benchmarks/dynamo/check_accuracy.py index 7fa24ae7346b1..678cee5f752c3 100644 --- a/benchmarks/dynamo/check_accuracy.py +++ b/benchmarks/dynamo/check_accuracy.py @@ -14,6 +14,9 @@ "detectron2_maskrcnn_r_101_c4", "timm_efficientnet", # see https://github.com/pytorch/pytorch/issues/148699 "XGLMForCausalLM", # discovered in https://github.com/pytorch/pytorch/pull/128148 + "moondream", # discovered in https://github.com/pytorch/pytorch/pull/159291 + # discovered in https://github.com/pytorch/pytorch/issues/161419. Its not flaky but really hard to repro, so skipping it + "mobilenetv3_large_100", } diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv index 66e088f334071..f65909f3a24ea 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv index af605accecf6e..1d199fe8ea664 100644 --- 
a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 @@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv index 33ede2b914b4f..54b7d63f3a4bc 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,23 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv index 1cafcbe55675d..169a42ff7cd41 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv @@ -42,14 +42,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -154,10 +146,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_huggingface_inference.csv index 1cafcbe55675d..169a42ff7cd41 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_huggingface_inference.csv @@ -42,14 +42,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -154,10 +146,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_timm_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_timm_inference.csv index c889ba0e8d2f7..c7d283b9aa52d 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_timm_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_timm_inference.csv @@ 
-46,7 +46,7 @@ deit_base_distilled_patch16_224,pass,0 -dla102,pass,0 +dla102,timeout,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv index faafea393ede5..e68aa2fa5351f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv @@ -138,7 +138,7 @@ hf_Bert_large,pass,0 -hf_BigBird,pass,24 +hf_BigBird,pass,25 @@ -158,7 +158,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -346,7 +346,7 @@ vgg16,pass,0 -vision_maskrcnn,fail_accuracy,30 +vision_maskrcnn,fail_accuracy,29 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_timm_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_timm_inference.csv index c889ba0e8d2f7..c7d283b9aa52d 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_timm_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_timm_inference.csv @@ -46,7 +46,7 @@ deit_base_distilled_patch16_224,pass,0 -dla102,pass,0 +dla102,timeout,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv index a2b7c1a7b15ca..aec659fdcd654 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv @@ -138,7 +138,7 @@ hf_Bert_large,pass,0 -hf_BigBird,pass,24 +hf_BigBird,pass,25 @@ -158,7 +158,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,5 +hf_Reformer,pass,8 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv 
index 697fe04cd91a5..4f2eec1493520 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv @@ -138,7 +138,7 @@ hf_Bert_large,pass,0 -hf_BigBird,pass,24 +hf_BigBird,pass,25 @@ -158,7 +158,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,5 +hf_Reformer,pass,8 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv index 66e088f334071..f65909f3a24ea 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv index 7f11e13980273..20cad351b1275 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 @@ -146,7 +146,7 @@ hf_Bert_large,pass,0 -hf_BigBird,fail_to_run,0 +hf_BigBird,pass,0 @@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv index cb8cead2ba034..5050b3762ed96 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv @@ -2,7 +2,7 @@ name,accuracy,graph_breaks -torchrec_dlrm,fail_to_run,3 +torchrec_dlrm,pass,6 @@ -94,7 +94,7 @@ hf_Bert_large,pass,6 -hf_BigBird,fail_to_run,3 +hf_BigBird,pass,6 @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,fail_to_run,19 +hf_Reformer,pass,25 diff --git 
a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv index c251f34c0e944..b0e8f34b964ec 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv @@ -34,7 +34,7 @@ basic_gnn_gin,pass,0 -basic_gnn_sage,fail_to_run,0 +basic_gnn_sage,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv index 6f9e9e0ed5a7b..c8db4d5823203 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv @@ -122,7 +122,7 @@ hf_Bert_large,pass,0 -hf_BigBird,pass,24 +hf_BigBird,pass,25 @@ -142,7 +142,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,5 +hf_Reformer,pass,8 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv index 4f7ca2b638c48..f4c9ffddd9974 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv @@ -138,7 +138,7 @@ hf_Bert_large,pass,0 -hf_BigBird,pass,24 +hf_BigBird,pass,25 @@ -158,7 +158,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,5 +hf_Reformer,pass,8 diff --git 
a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv index 66e088f334071..f65909f3a24ea 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv index 7f11e13980273..2b2c1a504647f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 @@ -146,7 +146,7 @@ hf_Bert_large,pass,0 -hf_BigBird,fail_to_run,0 +hf_BigBird,fail_accuracy,0 @@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv index 05eb7e3546eef..89871fd49a04b 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv @@ -2,7 +2,7 @@ name,accuracy,graph_breaks -torchrec_dlrm,fail_to_run,3 +torchrec_dlrm,pass,6 @@ -46,7 +46,7 @@ dcgan,pass,6 -demucs,fail_to_run,4 +demucs,pass,9 @@ -94,7 +94,7 @@ hf_Bert_large,pass,6 -hf_BigBird,fail_to_run,3 +hf_BigBird,pass,6 @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,fail_to_run,19 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv +++ 
b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv index 66e088f334071..f65909f3a24ea 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv index af605accecf6e..1d199fe8ea664 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 @@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv index 44983e8ecc214..0985e42fc5cb9 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,23 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv index 66e088f334071..f65909f3a24ea 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv +++ 
b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv index 9a9a68629f875..e41018657c0e2 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 @@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv index 33ede2b914b4f..54b7d63f3a4bc 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,23 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_training.csv index 9fdb41506e3b2..08061de428d71 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv index b3a3265baa16f..6f316b219bb92 
100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv @@ -166,7 +166,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -181,7 +181,7 @@ hf_T5_base,pass,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_training.csv index d2300bdac05b8..48d0b111788f7 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_training.csv @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -114,7 +114,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,23 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_huggingface_inference.csv index 1cafcbe55675d..ce334e22c698b 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_huggingface_inference.csv @@ -42,14 +42,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -66,7 +58,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -154,10 +146,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_training.csv index 9fdb41506e3b2..08061de428d71 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv index 
624f295624783..4b5138ce9c367 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv @@ -166,7 +166,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -181,7 +181,7 @@ hf_T5_base,pass,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_training.csv index 1605a26b7ce5f..643a02fdca8fd 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_training.csv @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -114,7 +114,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,23 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_training.csv index 66e088f334071..f65909f3a24ea 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv index 6776cc5f5d7a7..a3fc7cf192371 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv @@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -174,7 +174,7 @@ hf_T5_base,eager_fail_to_run,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_training.csv index b43e38b7d822a..ced88884720b7 100644 --- 
a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_training.csv @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,23 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv index 9fdb41506e3b2..08061de428d71 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv index b3a3265baa16f..6f316b219bb92 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv @@ -166,7 +166,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -181,7 +181,7 @@ hf_T5_base,pass,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_training.csv index 754f5f718e436..d1606b622639e 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_training.csv @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -114,7 +114,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,23 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 
-DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_training.csv index 66e088f334071..f65909f3a24ea 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv index 3e4e9ee702aa3..8ccf95da9659e 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv @@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -174,7 +174,7 @@ hf_T5_base,eager_fail_to_run,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_training.csv index 86ad955b5a2cb..e842ac7cb8e1f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_training.csv @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,23 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index 516549d7f6569..2901009f7c4d1 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -21,6 +21,7 @@ import signal import subprocess import sys +import tempfile import time import weakref from contextlib import contextmanager @@ -41,6 +42,7 @@ import torch.distributed import torch.multiprocessing as mp from torch._C import _has_cuda as HAS_CUDA, _has_xpu as HAS_XPU +from torch._C._nativert import PyModelRunner from torch._dynamo.profiler import fx_insert_profiling, Profiler from torch._dynamo.testing import ( dummy_fx_compile, @@ -202,7 +204,6 @@ class CI(NamedTuple): "PLBartForCausalLM", "PLBartForConditionalGeneration", "PegasusForCausalLM", - "Speech2Text2ForCausalLM", "TrOCRForCausalLM", "XGLMForCausalLM", # TIMM @@ -1100,6 +1101,10 @@ def maybe_mark_profile(*args, **kwargs): frozen_model_iter_fn = export_aot_inductor( model, example_inputs, args.inductor_compile_mode ) + elif args.export_nativert: + frozen_model_iter_fn = export_nativert(model, example_inputs) + elif args.torchscript_jit_trace: + frozen_model_iter_fn = torchscript_jit_trace(model, example_inputs) else: frozen_model_iter_fn = 
torch._dynamo.run(model_iter_fn) @@ -1446,6 +1451,60 @@ def get_excess_memory(cls, model) -> float: return cls.cache.get(weakref.ref(model), (None, 0.0))[1] +class NativeRTCache: + cache: dict[weakref.ref, Any] = {} + + @classmethod + def load(cls, model, example_inputs): + from torch.export.dynamic_shapes import _combine_args, _tree_map_with_path + + key = weakref.ref(model) + if key not in cls.cache: + example_args, example_kwargs = _normalize_bench_inputs(example_inputs) + example_outputs = model(*example_args, **example_kwargs) + _register_dataclass_output_as_pytree(example_outputs) + + combined_args = _combine_args(model, example_args, example_kwargs) + dynamic_shapes = _tree_map_with_path( + _produce_dynamic_shapes_for_export, combined_args + ) + + ep = torch.export.export( + model, example_args, example_kwargs, dynamic_shapes=dynamic_shapes + ) + ep = ep.run_decompositions({}) + with tempfile.NamedTemporaryFile(delete=False) as f: + torch.export.pt2_archive._package.package_pt2( + f, exported_programs={"forward": ep} + ) + filename = f.name + cls.cache[key] = PyModelRunner(filename, "forward") + + return cls.cache[key] + + +class JitTracedCache: + cache: dict[weakref.ref, Any] = {} + + @classmethod + def load(cls, model, example_inputs): + key = weakref.ref(model) + if key not in cls.cache: + example_args, example_kwargs = _normalize_bench_inputs(example_inputs) + if example_args: + jit_traced_module = torch.jit.trace( + model, example_inputs=example_args, strict=False + ) + else: + jit_traced_module = torch.jit.trace( + model, example_kwarg_inputs=example_kwargs, strict=False + ) + + cls.cache[key] = jit_traced_module + + return cls.cache[key] + + def export(model, example_inputs): from torch.export.dynamic_shapes import _combine_args, _tree_map_with_path @@ -1472,6 +1531,16 @@ def opt_export(_, example_inputs): return opt_export +def export_nativert(model, example_inputs): + optimized = NativeRTCache.load(model, example_inputs) + + def opt_nativert(_, example_inputs, collect_outputs=False): + example_args, example_kwargs = _normalize_bench_inputs(example_inputs) + return optimized.run(*example_args, **example_kwargs) + + return opt_nativert + + def export_aot_inductor(model, example_inputs, mode): optimized = AOTInductorModelCache.load(model, example_inputs, mode) @@ -1482,6 +1551,16 @@ def opt_aot_inductor(_, example_inputs, collect_outputs=False): return opt_aot_inductor +def torchscript_jit_trace(model, example_inputs): + optimized = JitTracedCache.load(model, example_inputs) + + def opt_jit_trace(_, example_inputs, collect_outputs=False): + example_args, example_kwargs = _normalize_bench_inputs(example_inputs) + return optimized(*example_args, **example_kwargs) + + return opt_jit_trace + + def download_retry_decorator(download_fn): """ Decorator function for applying retry logic to a download function. 
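The NativeRT path wired in above exports the eager module, runs decompositions, packages the exported program into a PT2 archive, and then executes it through torch._C._nativert.PyModelRunner. The condensed Python sketch below walks that same flow outside the benchmark harness; TinyModel and its example inputs are hypothetical stand-ins, and the per-model weakref cache and dynamic-shape derivation that NativeRTCache performs are intentionally left out.

import tempfile

import torch
from torch._C._nativert import PyModelRunner


class TinyModel(torch.nn.Module):
    # Hypothetical module used only to illustrate the export -> package -> run flow.
    def forward(self, x):
        return torch.nn.functional.relu(x) + 1


model = TinyModel().eval()
example_args = (torch.randn(4, 8),)

# Export and decompose, mirroring what NativeRTCache.load does (minus dynamic shapes).
ep = torch.export.export(model, example_args)
ep = ep.run_decompositions({})

# Package the exported program into a PT2 archive on disk.
with tempfile.NamedTemporaryFile(delete=False) as f:
    torch.export.pt2_archive._package.package_pt2(
        f, exported_programs={"forward": ep}
    )
    archive_path = f.name

# Load the archive with NativeRT and run it the way opt_nativert does.
runner = PyModelRunner(archive_path, "forward")
outputs = runner.run(*example_args)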
@@ -2228,7 +2307,12 @@ def record_status(accuracy_status, dynamo_start_stats): try: model_copy = self.deepcopy_and_maybe_parallelize(model) self.init_optimizer(name, current_device, model_copy.parameters()) - if self.args.export or self.args.export_aot_inductor: + if ( + self.args.export + or self.args.export_aot_inductor + or self.args.export_nativert + or self.args.torchscript_jit_trace + ): # apply export on module directly # no need for n iterations # the logic should be the same to self.model_iter_fn (forward_pass) @@ -2624,7 +2708,11 @@ def warmup(fn, model, example_inputs, mode, niters=5): niters=1, ) - if self.args.export_aot_inductor: + if ( + self.args.export_aot_inductor + or self.args.export_nativert + or self.args.torchscript_jit_trace + ): optimized_model_iter_fn = optimize_ctx else: optimized_model_iter_fn = optimize_ctx(self.model_iter_fn) @@ -3377,6 +3465,16 @@ def get_example_inputs(self): action="store_true", help="Measure pass rate with Export+AOTInductor", ) + group.add_argument( + "--export-nativert", + action="store_true", + help="Measure pass rate with Export+NativeRT", + ) + group.add_argument( + "--torchscript-jit-trace", + action="store_true", + help="Measure pass rate with TorchScript jit.trace", + ) group.add_argument( "--xla", action="store_true", help="Compare TorchXLA to eager PyTorch" ) @@ -3818,6 +3916,14 @@ def run(runner, args, original_dir=None): optimize_ctx = export experiment = speedup_experiment output_filename = "export.csv" + elif args.export_nativert: + optimize_ctx = export_nativert + experiment = speedup_experiment + output_filename = "export_nativert.csv" + elif args.torchscript_jit_trace: + optimize_ctx = torchscript_jit_trace + experiment = speedup_experiment + output_filename = "torchscript_jit_trace.csv" elif args.xla: (dev,) = args.devices os.environ["PJRT_DEVICE"] = {"cuda": "GPU", "cpu": "CPU"}[dev] @@ -4132,7 +4238,7 @@ def detect_and_mark_batch(t): nonlocal marked for i, s in enumerate(t.size()): if s == batch_size: - torch._dynamo.mark_dynamic(t, i) + torch._dynamo.maybe_mark_dynamic(t, i) marked = True break diff --git a/benchmarks/dynamo/huggingface.py b/benchmarks/dynamo/huggingface.py index 916a33276d996..76026731fe890 100755 --- a/benchmarks/dynamo/huggingface.py +++ b/benchmarks/dynamo/huggingface.py @@ -370,6 +370,7 @@ def use_larger_multiplier_for_smaller_tensor(self, name): return name in [ "ElectraForQuestionAnswering", "MegatronBertForQuestionAnswering", + "GPT2ForSequenceClassification", ] def _get_model_cls_and_config(self, model_name): @@ -459,6 +460,12 @@ def load_model( else: model.eval() + # Turning off kv cache for torchbench models. This is not the right + # thing to do, but the pt2 dashboard is outdated. Real transformers + # benchmarks will be added soon using a different infra. 
+ if hasattr(model, "config") and hasattr(model.config, "use_cache"): + model.config.use_cache = False + self.validate_model(model, example_inputs) return device, model_name, model, example_inputs, batch_size diff --git a/benchmarks/dynamo/huggingface.yaml b/benchmarks/dynamo/huggingface.yaml index f0ee57a589657..5640776117096 100644 --- a/benchmarks/dynamo/huggingface.yaml +++ b/benchmarks/dynamo/huggingface.yaml @@ -31,8 +31,6 @@ batch_size: BlenderbotSmallForCausalLM: 4 BlenderbotSmallForConditionalGeneration: 2 CamemBert: 2 - DebertaForMaskedLM: 4 - DebertaForQuestionAnswering: 2 DebertaV2ForMaskedLM: 4 DebertaV2ForQuestionAnswering: 8 DistilBertForMaskedLM: 2 @@ -63,7 +61,6 @@ batch_size: PegasusForConditionalGeneration: 2 RobertaForCausalLM: 2 RobertaForQuestionAnswering: 2 - Speech2Text2ForCausalLM: 4 T5ForConditionalGeneration: 2 T5Small: 2 TrOCRForCausalLM: 2 diff --git a/benchmarks/dynamo/huggingface_models_list.txt b/benchmarks/dynamo/huggingface_models_list.txt index 6e3cf19a783d7..12ceedd5c4ccc 100644 --- a/benchmarks/dynamo/huggingface_models_list.txt +++ b/benchmarks/dynamo/huggingface_models_list.txt @@ -10,8 +10,6 @@ BlenderbotForConditionalGeneration,16 BlenderbotSmallForCausalLM,256 BlenderbotSmallForConditionalGeneration,128 CamemBert,32 -DebertaForMaskedLM,32 -DebertaForQuestionAnswering,32 DebertaV2ForMaskedLM,8 DebertaV2ForQuestionAnswering,8 DistilBertForMaskedLM,256 @@ -42,7 +40,6 @@ PegasusForCausalLM,128 PegasusForConditionalGeneration,64 RobertaForCausalLM,32 RobertaForQuestionAnswering,32 -Speech2Text2ForCausalLM,1024 T5ForConditionalGeneration,8 T5Small,8 TrOCRForCausalLM,64 diff --git a/benchmarks/dynamo/huggingface_models_list_cpu.txt b/benchmarks/dynamo/huggingface_models_list_cpu.txt index cabd79ac830f6..4078368a69c44 100644 --- a/benchmarks/dynamo/huggingface_models_list_cpu.txt +++ b/benchmarks/dynamo/huggingface_models_list_cpu.txt @@ -10,8 +10,6 @@ BlenderbotForCausalLM,32 BlenderbotSmallForCausalLM,64 BlenderbotSmallForConditionalGeneration,64 CamemBert,16 -DebertaForMaskedLM,32 -DebertaForQuestionAnswering,8 DebertaV2ForMaskedLM,16 DebertaV2ForQuestionAnswering,2 DistilBertForMaskedLM,128 @@ -38,7 +36,6 @@ PLBartForCausalLM,8 PLBartForConditionalGeneration,4 RobertaForCausalLM,16 RobertaForQuestionAnswering,16 -Speech2Text2ForCausalLM,32 T5ForConditionalGeneration,4 T5Small,1 TrOCRForCausalLM,32 diff --git a/benchmarks/dynamo/pr_time_benchmarks/check_results.py b/benchmarks/dynamo/pr_time_benchmarks/check_results.py index f9204ee98fb05..734d3a01c1e82 100644 --- a/benchmarks/dynamo/pr_time_benchmarks/check_results.py +++ b/benchmarks/dynamo/pr_time_benchmarks/check_results.py @@ -132,10 +132,10 @@ def log(event_name): ) new_entry = copy.deepcopy(entry) - # only change if abs(ratio) > entry.noise_margin /3. + # only change if abs(ratio) > entry.noise_margin /5. 
new_entry.expected_value = ( replace_with_zeros(result) - if abs(ratio) > entry.noise_margin * 100 / 3 + if abs(ratio) > entry.noise_margin * 100 / 5 else entry.expected_value ) new_expected[key] = new_entry diff --git a/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv b/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv index 5398c40f3573a..fc11be9ba6528 100644 --- a/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv +++ b/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv @@ -18,7 +18,7 @@ add_loop_inductor_gpu,compile_time_instruction_count,26800000000,0.1 -basic_modules_ListOfLinears_eager,compile_time_instruction_count,1009000000,0.1 +basic_modules_ListOfLinears_eager,compile_time_instruction_count,1048000000,0.1 @@ -74,15 +74,15 @@ aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10650000000,0 -mm_loop_inductor_gpu,compile_time_instruction_count,4461000000,0.1 +mm_loop_inductor_gpu,compile_time_instruction_count,4820968837,0.1 -mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,8417000000,0.1 +mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,8802129167,0.1 -basic_NestedModule_eager,compile_time_instruction_count,8787000000,0.1 +basic_NestedModule_eager,compile_time_instruction_count,9554000000,0.1 diff --git a/benchmarks/dynamo/torchbench.py b/benchmarks/dynamo/torchbench.py index c2568aa1daa19..1f10ecc661d8e 100755 --- a/benchmarks/dynamo/torchbench.py +++ b/benchmarks/dynamo/torchbench.py @@ -382,6 +382,22 @@ def load_model( if self.args.trace_on_xla: # work around for: https://github.com/pytorch/xla/issues/4174 import torch_xla # noqa: F401 + + # Turning off kv cache for torchbench models. This is not the right + # thing to do, but the torchbench models are way outdated, and since we + # are using torchbench pt2 dashboard to track regressions (rather than + # improving performance), we are just setting the kv cache to false. + # Real transformers benchmarks will be added soon using a different + # infra. + if ( + model_name.startswith("hf") + and hasattr(model, "config") + and hasattr(model.config, "use_cache") + ): + model.config.use_cache = False + if model_name == "hf_T5_generate": + model.model.config.use_cache = False + self.validate_model(model, example_inputs) return device, benchmark.name, model, example_inputs, batch_size diff --git a/benchmarks/dynamo/torchbench.yaml b/benchmarks/dynamo/torchbench.yaml index bf0a1b6c31e85..6a15cf33222b2 100644 --- a/benchmarks/dynamo/torchbench.yaml +++ b/benchmarks/dynamo/torchbench.yaml @@ -219,7 +219,9 @@ skip: - timm_regnet - timm_nfnet - cuda: [] + cuda: + # Temporary until https://github.com/pytorch/pytorch/issues/162282 is fixed + - sam_fast test: training: diff --git a/benchmarks/operator_benchmark/benchmark_core.py b/benchmarks/operator_benchmark/benchmark_core.py index cb836bb5eaa4b..0b7fcf4e555f8 100644 --- a/benchmarks/operator_benchmark/benchmark_core.py +++ b/benchmarks/operator_benchmark/benchmark_core.py @@ -4,6 +4,7 @@ import functools import json import os +import platform import timeit from collections import namedtuple from dataclasses import asdict, dataclass @@ -191,6 +192,11 @@ def __init__(self, args): self.predefined_minimum_secs = 1 self.max_iters = 1e6 self.use_jit = args.use_jit + self.use_compile = args.use_compile + if self.use_jit and self.use_compile: + raise ValueError( + "use_jit and use_compile are mutually exclusive, please specify one." 
+ ) self.num_runs = args.num_runs self.print_per_iter = False self.output_csv = args.output_csv @@ -222,7 +228,7 @@ def _print_header(self): if self.args.operators: print(f"# {self.args.operators}") - def _print_perf_result(self, reported_run_time_us, test_case): + def _print_perf_result(self, results, test_case): if self.args.report_aibench: # Output for AIBench # Print out per iteration execution time instead of avg time @@ -236,12 +242,14 @@ def _print_perf_result(self, reported_run_time_us, test_case): "type": test_name, "metric": "latency", "unit": "us", - "value": str(reported_run_time_us[run]), + "value": str(results["reported_run_time_us"[run]]), } ) ) else: - print(f"# Mode: {'JIT' if self.use_jit else 'Eager'}") + print( + f"# Mode: {'JIT' if self.use_jit else 'Compile' if self.use_compile else 'Eager'}" + ) print( f"# Name: {test_case.test_config.test_name}\n# Input: {test_case.test_config.input_config}" ) @@ -250,25 +258,33 @@ def _print_perf_result(self, reported_run_time_us, test_case): if self.num_runs > 1: for run in range(self.num_runs): print( - f"Run: {run}, {mode} Execution Time (us) : {reported_run_time_us[run]:.3f}" + f"Run: {run}, {mode} Execution Time (us) : {results['reported_run_time_us'][run]:.3f}" ) print() else: - print(f"{mode} Execution Time (us) : {reported_run_time_us[0]:.3f}\n") + print( + f"{mode} Execution Time (us) : {results['reported_run_time_us'][0]:.3f}" + ) + print(f"Peak Memory (KB) : {results['peak_memory']}\n") - def _perf_result_to_dict(self, reported_run_time_us, test_case): + def _perf_result_to_dict(self, results, test_case): """This function is the parallel of _print_perf_result, which instead of writing information to terminal, returns a dictionary. """ if self.args.report_aibench: return {} + out = { "test_name": test_case.test_config.test_name, "input_config": test_case.test_config.input_config, - "mode": "JIT" if self.use_jit else "Eager", + "runtime": ( + "JIT" if self.use_jit else "Compile" if self.use_compile else "Eager" + ), "run": "Backward" if test_case.test_config.run_backward else "Forward", - "latency": round(reported_run_time_us[0], 3), + "latency": round(results["reported_run_time_us"][0], 3), "latency unit": "us", + "peak memory": results["peak_memory"], + "memory unit": "KB", } # parsing test_case.test_config.input_config, adding it as entries to the 'out' dictionary @@ -330,6 +346,8 @@ def _launch_forward(self, test_case, iters, print_per_iter): func = test_case.run_forward if self.use_jit: func = test_case.run_jit_forward + if self.use_compile: + func = test_case.run_compile_forward forward_time = timeit.timeit( functools.partial(func, iters, print_per_iter, cuda_sync), number=1 ) @@ -346,7 +364,7 @@ def _launch_backward(self, test_case, iters, print_per_iter=False): ) return backward_time - def _measure_time(self, launch_test, test_case, iters, print_per_iter): + def _measure_metrics(self, launch_test, test_case, iters, print_per_iter): """ This function execute the operator for iterations then look at the time. If it's not significant, the number of iterations will be increased before rerun. 
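# A minimal sketch of how a "Peak Memory (KB)" figure like the one reported above can
# be obtained, assuming a CUDA device; the matmul workload is illustrative, and the
# counters used are the standard torch.cuda memory statistics.
import torch


def run_once() -> None:
    a = torch.randn(2048, 2048, device="cuda")
    b = torch.randn(2048, 2048, device="cuda")
    torch.mm(a, b)


torch.cuda.reset_peak_memory_stats()
run_once()
torch.cuda.synchronize()  # ensure the kernels finished before reading the counter
peak_kb = torch.cuda.max_memory_allocated() / 1024
print(f"Peak Memory (KB) : {peak_kb}")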
@@ -354,8 +372,20 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): """ curr_test_total_time = 0 time_trace = [] + peak_memory = 0 + sample_input = next(iter(test_case.op_bench.inputs.values())) + device = sample_input.device + device_module = torch.get_device_module(device.type) + # TODO: add support for cpu memory measurement while True: + if hasattr(device_module, "reset_peak_memory_stats"): + device_module.reset_peak_memory_stats(device) run_time_sec = launch_test(test_case, iters, print_per_iter) + if hasattr(device_module, "synchronize"): + device_module.synchronize(device) + # Memory measurement process + if hasattr(device_module, "max_memory_allocated"): + peak_memory = device_module.max_memory_allocated(device) curr_test_total_time += run_time_sec # Analyze time after each run to decide if the result is stable results_are_significant = self._iteration_result_is_significant( @@ -369,7 +399,13 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): time_trace.append(report_run_time) # Print out the time spent in each epoch in ms if self.args.report_aibench: - mode = "JIT" if self.use_jit else "Eager" + mode = ( + "JIT" + if self.use_jit + else "Compile" + if self.use_compile + else "Eager" + ) test_name = "_".join( [test_case.framework, test_case.test_config.test_name, mode] ) @@ -381,7 +417,7 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): "metric": "latency", "unit": "ms", "value": str(report_run_time / 1e3), - } + }, ) ) if results_are_significant: @@ -391,7 +427,7 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): # iteration count, and run the benchmark again... iters = self._predict_num_iter_needed(iters) reported_run_time_us = np.percentile(np.array(time_trace), 50) - return reported_run_time_us + return reported_run_time_us, peak_memory / 1024 def _check_keep(self, test_flag, cmd_flag): return cmd_flag is None or test_flag == cmd_flag @@ -478,6 +514,7 @@ def _output_json( self, perf_list, output_file, + benchmark_name="PyTorch operator benchmark", ): """ Write the result into JSON format, so that it can be uploaded to the benchmark database @@ -495,8 +532,10 @@ def _output_json( input_config = perf_item.get("input_config", "") run_type = perf_item.get("run") latency = perf_item.get("latency", 0) - - dtype = "float32" # default + peak_memory = perf_item.get("peak memory", 0) + device = perf_item.get("device", "unknown") + dtype = perf_item.get("dtype", "torch.float").split(".")[1] + runtime = perf_item.get("runtime", None) # Extract mode based on run_type mode = None @@ -505,6 +544,22 @@ def _output_json( elif run_type == "Backward": mode = "training" + # Extract use_compile from it + if runtime == "Compile": + use_compile = True + elif runtime == "Eager": + use_compile = False + else: + use_compile = None + + device_arch = ( + torch.cuda.get_device_name(0) + if device == "cuda" + else platform.processor() + if device == "cpu" + else "unknown" + ) + # Create the record @dataclass class BenchmarkInfo: @@ -532,12 +587,18 @@ class BenchmarkRecord: model: ModelInfo metric: MetricInfo - record = BenchmarkRecord( + # Add record for latency + record_latency = BenchmarkRecord( benchmark=BenchmarkInfo( - name="PyTorch operator benchmark", + name=benchmark_name, mode=mode, dtype=dtype, - extra_info={"input_config": input_config}, + extra_info={ + "input_config": input_config, + "device": device, + "arch": device_arch, + "use_compile": use_compile, + }, ), model=ModelInfo( name=test_name, 
type="micro-benchmark", origins=["pytorch"] @@ -549,8 +610,17 @@ class BenchmarkRecord: target_value=None, ), ) - - records.append(asdict(record)) + records.append(asdict(record_latency)) + + # Add record for peak memory + record_memory = copy.deepcopy(record_latency) + record_memory.metric = MetricInfo( + name="peak memory", + unit="KB", + benchmark_values=[peak_memory], + target_value=None, + ) + records.append(asdict(record_memory)) # Write all records to the output file with open(output_file, "w", encoding="utf-8") as f: @@ -566,6 +636,7 @@ def run(self): "tag", "run_backward", "Execution Time", + "Peak Memory (KB)", ] if self.args.output_json or self.args.output_json_for_dashboard: @@ -603,13 +674,16 @@ def run(self): test_case, self.args.warmup_iterations, print_per_iter=False ) # Actual Execution - reported_time = [ - self._measure_time( + results = [ + self._measure_metrics( launch_func, test_case, self.iters, self.print_per_iter ) for _ in range(self.num_runs) ] - self._print_perf_result(reported_time, test_case) + result_dict = dict() + result_dict["reported_run_time_us"] = [r[0] for r in results] + result_dict["peak_memory"] = results[0][1] + self._print_perf_result(results=result_dict, test_case=test_case) # output results to csv self._output_csv( @@ -625,16 +699,17 @@ def run(self): ), test_case.test_config.tag, test_case.test_config.run_backward, - reported_time[0], + result_dict["reported_run_time_us"][0], + result_dict["peak_memory"], ], ) if self.args.output_json or self.args.output_json_for_dashboard: - perf_list.append( - self._perf_result_to_dict(reported_time, test_case) - ) + perf_list.append(self._perf_result_to_dict(result_dict, test_case)) if self.args.output_json_for_dashboard: - self._output_json(perf_list, self.args.output_json_for_dashboard) + self._output_json( + perf_list, self.args.output_json_for_dashboard, self.args.benchmark_name + ) if self.args.output_json: with open(self.args.output_json, "w") as f: diff --git a/benchmarks/operator_benchmark/benchmark_pytorch.py b/benchmarks/operator_benchmark/benchmark_pytorch.py index 52ae47047daab..a7ff40ebb340e 100644 --- a/benchmarks/operator_benchmark/benchmark_pytorch.py +++ b/benchmarks/operator_benchmark/benchmark_pytorch.py @@ -4,6 +4,15 @@ import torch +# Import the C++ extension to register the _consume operator +try: + import benchmark_cpp_extension # noqa: F401 +except ImportError as err: + # If the extension isn't built, the script must raise an error + raise ImportError( + "Failed to import C++ extension, please build it using \ncd pt_extension \npython -m pip install ." + ) from err + """PyTorch performance microbenchmarks. 
This module contains PyTorch-specific functionalities for performance @@ -71,6 +80,16 @@ def forward_consume(self, iters: int): for _ in range(iters): torch.ops.operator_benchmark._consume(self.forward_impl()) + def forward_impl_eager(self): + # This is to supply the inputs to the forward function which + # will be called in both the eager and compile mode of local runs + return self.forward(*self.get_inputs()) + + def forward_consume_eager(self, iters: int): + # Eager version of forward_consume without decorators (compilation handled by torch.compile) + for _ in range(iters): + torch.ops.operator_benchmark._consume(self.forward_impl_eager()) + def module_name(self): """this is used to label the operator being benchmarked""" if self.user_given_name: @@ -117,18 +136,32 @@ def __init__(self, op_bench, test_config): self.framework = "PyTorch" self.time_series = [] self._jit_forward_graph = None + self._compile_forward_graph = None def _generate_jit_forward_graph(self): """generate a graph for the forward function via scripting""" scripted_op_bench = torch.jit.script(self.op_bench) return scripted_op_bench.forward_consume + def _generate_compile_forward_graph(self): + """generate a compiled graph for the forward function via torch.compile""" + compiled_forward_consume = torch.compile( + self.op_bench.forward_consume_eager, backend="inductor" + ) + return compiled_forward_consume + def run_jit_forward(self, num_runs, print_per_iter=False, cuda_sync=False): """Run the forward path of an op with JIT mode""" if self._jit_forward_graph is None: self._jit_forward_graph = self._generate_jit_forward_graph() self._jit_forward_graph(num_runs) + def run_compile_forward(self, num_runs, print_per_iter=False, cuda_sync=False): + """Run the forward path of an op with compile mode""" + if self._compile_forward_graph is None: + self._compile_forward_graph = self._generate_compile_forward_graph() + self._compile_forward_graph(num_runs) + def _print_per_iter(self): # print last 50 values length = min(len(self.time_series), 50) @@ -150,14 +183,14 @@ def run_forward(self, num_runs, print_per_iter, cuda_sync): if print_per_iter: for _ in range(num_runs): start_time = time.time() - self.output = self.op_bench.forward_impl() + self.output = self.op_bench.forward_impl_eager() if cuda_sync: torch.cuda.synchronize(torch.cuda.current_device()) end_time = time.time() self.time_series.append((end_time - start_time) * 1e3) else: for _ in range(num_runs): - self.output = self.op_bench.forward_impl() + self.output = self.op_bench.forward_impl_eager() if cuda_sync: torch.cuda.synchronize(torch.cuda.current_device()) diff --git a/benchmarks/operator_benchmark/benchmark_runner.py b/benchmarks/operator_benchmark/benchmark_runner.py index 9dfab781498ea..6568cf9bf3ee6 100644 --- a/benchmarks/operator_benchmark/benchmark_runner.py +++ b/benchmarks/operator_benchmark/benchmark_runner.py @@ -62,6 +62,13 @@ def parse_args(): default=None, ) + parser.add_argument( + "--benchmark-name", + "--benchmark_name", + help="Name of the benchmark to store results to", + default="PyTorch operator benchmark", + ) + parser.add_argument( "--list-tests", "--list_tests", @@ -135,6 +142,16 @@ def parse_args(): help="Run operators with PyTorch JIT mode", ) + parser.add_argument( + "--use-compile", + "--use_compile", + type=benchmark_utils.str2bool, + nargs="?", + const=True, + default=False, + help="Run operators with PyTorch Compile mode", + ) + parser.add_argument( "--forward-only", "--forward_only", @@ -162,7 +179,7 @@ def parse_args(): 
"--output-json-for-dashboard", "--output_json_for_dashboard", help="Save results in JSON format for display on the OSS dashboard", - default="False", + default="benchmark-results.json", ) args, _ = parser.parse_known_args() diff --git a/buckbuild.bzl b/buckbuild.bzl index 09a515584d97c..218fd747301f9 100644 --- a/buckbuild.bzl +++ b/buckbuild.bzl @@ -11,7 +11,7 @@ load("//tools/build_defs:glob_defs.bzl", "subdir_glob") load("//tools/build_defs:platform_defs.bzl", "APPLETVOS", "IOS", "MACOSX") load("//tools/build_defs:type_defs.bzl", "is_list", "is_string") load("//tools/build_defs/android:build_mode_defs.bzl", is_production_build_android = "is_production_build") -load("//tools/build_defs/apple:build_mode_defs.bzl", is_production_build_ios = "is_production_build") +load("//tools/build_defs/apple:build_mode_defs.bzl", is_production_build_ios = "is_production_build", is_profile_build_ios = "is_profile_build") load( ":build_variables.bzl", "aten_cpu_source_list", @@ -74,7 +74,7 @@ def _is_build_mode_dev(): if is_production_build_android(): # Android Prod builds return False - if is_production_build_ios(): + if is_production_build_ios() or is_profile_build_ios(): # iOS Prod builds return False @@ -824,9 +824,13 @@ def get_pt_operator_registry_dict( apple_sdks = kwargs.get("apple_sdks"), ) + # Extract existing linker_flags from kwargs and combine with default flags + existing_linker_flags = kwargs.pop("linker_flags", []) + combined_linker_flags = get_no_as_needed_linker_flag() + existing_linker_flags + return dict( srcs = code_gen_files["srcs"], - linker_flags = get_no_as_needed_linker_flag(), + linker_flags = combined_linker_flags, # @lint-ignore BUCKLINT link_whole link_whole = True, soname = "libtorch-code-gen.$(ext)", @@ -944,6 +948,7 @@ def define_buck_targets( [ ("torch/csrc/api/include", "torch/**/*.h"), ("", "torch/csrc/**/*.h"), + ("", "torch/csrc/**/*.hpp"), ("", "torch/nativert/**/*.h"), ("", "torch/headeronly/**/*.h"), ("", "torch/script.h"), @@ -1144,6 +1149,9 @@ def define_buck_targets( "--replace", "@AT_KLEIDIAI_ENABLED@", "0", + "--replace", + "@AT_USE_EIGEN_SPARSE@", + "0", ]), outs = { "Config.h": ["Config.h"], @@ -2026,6 +2034,7 @@ def define_buck_targets( ("", "caffe2/utils/*.h"), ("", "caffe2/core/*.h"), ("", "torch/csrc/*.h"), + ("", "torch/csrc/*.hpp"), ("", "torch/csrc/api/include/torch/*.h"), ("", "torch/csrc/autograd/*.h"), ("", "torch/csrc/autograd/*/*.h"), diff --git a/build_variables.bzl b/build_variables.bzl index a226249db7089..05b1cfdc7a4b0 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -512,6 +512,7 @@ libtorch_distributed_base_sources = [ "torch/csrc/distributed/c10d/TCPStore.cpp", "torch/csrc/distributed/c10d/TCPStoreBackend.cpp", "torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp", + "torch/csrc/distributed/c10d/Types.cpp", "torch/csrc/distributed/c10d/Utils.cpp", "torch/csrc/distributed/c10d/Work.cpp", "torch/csrc/distributed/c10d/comm.cpp", @@ -631,6 +632,16 @@ libtorch_nativert_sources = [ "torch/nativert/kernels/NativeKernels.cpp", "torch/nativert/kernels/GeneratedStaticDispatchKernels.cpp", "torch/nativert/kernels/GeneratedNativeStaticDispatchKernels.cpp", + "torch/nativert/graph/passes/SubgraphRewriter.cpp", + "torch/nativert/graph/passes/pass_manager/GraphPasses.cpp", + "torch/nativert/graph/passes/pass_manager/PassManager.cpp", + "torch/nativert/kernels/KernelHandlerRegistry.cpp", + "torch/nativert/kernels/TritonKernel.cpp", + "torch/nativert/executor/triton/CpuTritonKernelManager.cpp", +] + +libtorch_nativert_cuda_sources = [ + 
"torch/nativert/executor/triton/CudaTritonKernelManager.cpp", ] torch_mobile_tracer_sources = [ @@ -751,14 +762,22 @@ libtorch_cuda_distributed_extra_sources = [ "torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu", "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp", "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu", + "torch/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp", "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp", ] +libtorch_nvshmem_sources = [ + "torch/csrc/distributed/c10d/cuda/utils.cpp", + "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp", + "torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu", + "torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu", +] + libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_sources + [ "torch/csrc/cuda/nccl.cpp", -] +] + libtorch_nativert_cuda_sources torch_cpp_srcs = [ "torch/csrc/api/src/cuda.cpp", # this just forwards stuff, no real CUDA @@ -1075,6 +1094,7 @@ aten_cpu_source_non_codegen_list = [ "aten/src/ATen/DeviceAccelerator.cpp", "aten/src/ATen/Context.cpp", "aten/src/ATen/DLConvertor.cpp", + "aten/src/ATen/DTensorState.cpp", "aten/src/ATen/EmptyTensor.cpp", "aten/src/ATen/ExpandUtils.cpp", "aten/src/ATen/CachedTensorUtils.cpp", diff --git a/c10/core/AllocatorConfig.cpp b/c10/core/AllocatorConfig.cpp index e154338d501b2..c6b6e95f43b28 100644 --- a/c10/core/AllocatorConfig.cpp +++ b/c10/core/AllocatorConfig.cpp @@ -45,7 +45,7 @@ size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) { 63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart); const size_t interval_end = 63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd); - TORCH_CHECK_VALUE( + TORCH_CHECK( interval_end - interval_start == kRoundUpPowerOfTwoIntervals, "kRoundUpPowerOfTwoIntervals mismatch"); @@ -64,7 +64,7 @@ size_t AcceleratorAllocatorConfig::parseMaxSplitSize( std::numeric_limits::max() / kMB; size_t val_env = tokenizer.toSizeT(++i); - TORCH_CHECK_VALUE( + TORCH_CHECK( val_env >= min_allowed_split_size_mb, "CachingAllocator option max_split_size_mb too small, must be >= ", min_allowed_split_size_mb); @@ -83,7 +83,7 @@ size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize( std::numeric_limits::max() / kMB; size_t val_env = tokenizer.toSizeT(++i); - TORCH_CHECK_VALUE( + TORCH_CHECK( val_env >= min_allowed_split_size_mb, "CachingAllocator option max_non_split_rounding_mb too small, must be >= ", min_allowed_split_size_mb); @@ -98,7 +98,7 @@ size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold( size_t i) { tokenizer.checkToken(++i, ":"); double val_env = tokenizer.toDouble(++i); - TORCH_CHECK_VALUE( + TORCH_CHECK( val_env > 0 && val_env < 1.0, "garbage_collect_threshold is invalid, set it in (0.0, 1.0)"); garbage_collection_threshold_ = val_env; @@ -119,7 +119,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions( size_t value_index = i; tokenizer.checkToken(++i, ":"); size_t value = tokenizer.toSizeT(++i); - TORCH_CHECK_VALUE( + TORCH_CHECK( value == 0 || llvm::isPowerOf2_64(value), "For roundups, the divisions has to be power of 2 or 0 to disable roundup "); @@ -133,7 +133,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions( value); } else { size_t boundary = tokenizer.toSizeT(value_index); - TORCH_CHECK_VALUE( + TORCH_CHECK( llvm::isPowerOf2_64(boundary), "For roundups, the intervals have to be power 
of 2 "); @@ -163,7 +163,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions( "Expected closing bracket ']' in ConfigTokenizer but reached end of config"); } else { // Keep this for backwards compatibility size_t value = tokenizer.toSizeT(i); - TORCH_CHECK_VALUE( + TORCH_CHECK( llvm::isPowerOf2_64(value), "For roundups, the divisions has to be power of 2 "); std::fill( diff --git a/c10/core/AllocatorConfig.h b/c10/core/AllocatorConfig.h index efde5e3a8ff98..68cc47a8417c2 100644 --- a/c10/core/AllocatorConfig.h +++ b/c10/core/AllocatorConfig.h @@ -76,7 +76,7 @@ class ConfigTokenizer { } else if (token == "False") { return false; } else { - TORCH_CHECK_VALUE( + TORCH_CHECK( false, "Expected 'True' or 'False' at index ", i, diff --git a/c10/core/Backend.h b/c10/core/Backend.h index 67c9276313bba..0497d72b95703 100644 --- a/c10/core/Backend.h +++ b/c10/core/Backend.h @@ -237,8 +237,6 @@ inline DeviceType backendToDeviceType(Backend b) { return DeviceType::CPU; case Backend::CUDA: case Backend::SparseCUDA: - case Backend::SparseMPS: - case Backend::SparseCsrMPS: case Backend::QuantizedCUDA: case Backend::SparseCsrCUDA: return DeviceType::CUDA; @@ -276,6 +274,8 @@ inline DeviceType backendToDeviceType(Backend b) { case Backend::Meta: return DeviceType::Meta; case Backend::MPS: + case Backend::SparseMPS: + case Backend::SparseCsrMPS: return DeviceType::MPS; case Backend::HPU: return DeviceType::HPU; diff --git a/c10/core/CachingDeviceAllocator.cpp b/c10/core/CachingDeviceAllocator.cpp new file mode 100644 index 0000000000000..582efd59cf1b1 --- /dev/null +++ b/c10/core/CachingDeviceAllocator.cpp @@ -0,0 +1,10 @@ +#include + +namespace c10 { + +// Ensures proper DLL export of this pure virtual base class on Windows, +// since it's mainly used in other DLLs outside c10.dll. +DeviceAllocator::DeviceAllocator() = default; +DeviceAllocator::~DeviceAllocator() = default; + +} // namespace c10 diff --git a/c10/core/CachingDeviceAllocator.h b/c10/core/CachingDeviceAllocator.h index b23490de693a8..0bec03ae417fa 100644 --- a/c10/core/CachingDeviceAllocator.h +++ b/c10/core/CachingDeviceAllocator.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace c10::CachingDeviceAllocator { @@ -59,3 +60,55 @@ struct DeviceStats { }; } // namespace c10::CachingDeviceAllocator + +namespace c10 { + +using CaptureId_t = unsigned long long; + +// first is set if the instance is created by Graph mode capture_begin. +// second is set if the instance is created by Graph mode graph_pool_handle. +using MempoolId_t = std::pair; + +struct C10_API DeviceAllocator : public c10::Allocator { + DeviceAllocator(); + ~DeviceAllocator() override; + + // Returns true if the allocator has been properly initialized and is ready + // for use + virtual bool initialized() = 0; + + // Releases all cached device memory from the specified memory pool back to + // the system + virtual void emptyCache(MempoolId_t mempool_id = {0, 0}) = 0; + + // Associates a memory allocation with a stream to establish dependency + // tracking. 
Prevents memory reuse until all operations on the specified + // stream complete + virtual void recordStream(const DataPtr& ptr, c10::Stream stream) = 0; + + // Retrieves comprehensive memory statistics for the specified device, + // including allocation patterns, usage metrics + virtual CachingDeviceAllocator::DeviceStats getDeviceStats( + c10::DeviceIndex device) = 0; + + // Resets cumulative allocation statistics for the specified device to zero + virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0; + + // Resets peak memory usage statistics for the specified device + virtual void resetPeakStats(c10::DeviceIndex device) = 0; +}; + +// This function is used to get the DeviceAllocator for a specific device type +// and keep backward compatibility with c10::GetAllocator. +C10_API inline DeviceAllocator* getDeviceAllocator(const DeviceType& t) { + TORCH_CHECK( + t != DeviceType::CPU, + "getDeviceAllocator is not supported for CPU device type."); + auto* allocator = c10::GetAllocator(t); + auto* device_allocator = dynamic_cast(allocator); + TORCH_INTERNAL_ASSERT( + device_allocator, "Allocator for ", t, " is not a DeviceAllocator."); + return device_allocator; +} + +} // namespace c10 diff --git a/c10/core/Contiguity.h b/c10/core/Contiguity.h index 279a795583b12..eed3f24983424 100644 --- a/c10/core/Contiguity.h +++ b/c10/core/Contiguity.h @@ -33,7 +33,8 @@ bool _compute_contiguous(ArrayRef sizes, ArrayRef strides, T numel) { } // Return a SymBool with underlying symbolic expression that represents -// contiguity. Guaranteed not to add guards. +// contiguity. Guaranteed not to throw DDE, may returns a symbolic expressions +// or symbolic True. inline static c10::SymBool _compute_contiguous_sym( ArrayRef sizes, ArrayRef strides, @@ -76,6 +77,8 @@ inline static c10::SymBool _compute_contiguous_sym( return true; }; + // We try to minimize creating large symbolic expressions when not needed to + // avoid symbolic evaluation perf issues. if (is_contiguous_or_false()) { return c10::SymBool(true); } @@ -94,6 +97,9 @@ inline static c10::SymBool _compute_contiguous_sym( return is_contiguous_cond.sym_or(is_empty); } +// When T is SymInt this function may throw a data dependent error. +// _compute_channels_last_contiguous_2d_sym does not. Only use this function +// when inputs are hinted. template bool _compute_channels_last_contiguous_2d( ArrayRef sizes, @@ -105,8 +111,8 @@ bool _compute_channels_last_contiguous_2d( T expected = 1; for (auto& d : {1, 3, 2, 0}) { const auto& size_d = sizes[d]; - if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(size_d, 1))) { - if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(strides[d], expected))) { + if (size_d != 1) { + if (strides[d] != expected) { return false; } expected *= size_d; @@ -123,6 +129,65 @@ bool _compute_channels_last_contiguous_2d( } } +// Return a SymBool with underlying symbolic expression that represents +// contiguity. Guaranteed not to throw DDE, may returns a symbolic expressions +// or symbolic True. +inline static c10::SymBool _compute_channels_last_contiguous_2d_sym( + ArrayRef sizes, + ArrayRef strides) { + switch (sizes.size()) { + case 4: { + // When this function return True, result always true. When it return + // False, result could be False or data dependent. + auto guard_or_false = [&]() { + c10::SymInt expected = 1; + for (auto& d : {1, 3, 2, 0}) { + const auto& size_d = sizes[d]; + // Not taking this branch could make this return False instead of True + // but not vice-versa. so its ok. 
+ if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) { + continue; + } + // Taking this branch could make this return False instead of True + // but not vice-versa. so its ok. + if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected))) { + return false; + } + expected *= size_d; + } + return true; + }; + + // We try to minimize creating large symbolic expressions when not needed + // to avoid symbolic evaluation perf issues. + if (guard_or_false()) { + return c10::SymBool(true); + } + + // Result is either false, or data dependent. + c10::SymInt expected_stride = 1; + c10::SymBool cond = true; + + for (auto& d : {1, 3, 2, 0}) { + const auto& size_d = sizes[d]; + cond = cond.sym_and( + size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride))); + expected_stride *= size_d; + } + return cond; + } + // NOLINTNEXTLINE(bugprone-branch-clone) + case 3: + // TODO dim == 3 case will be enabled once it is fully tested + return c10::SymBool(false); + default: + return c10::SymBool(false); + } +} + +// When T is SymInt this function may throw a data dependent error. +// _compute_channels_last_contiguous_3d_sym does not. Only use this function +// when inputs are hinted. template bool _compute_channels_last_contiguous_3d( ArrayRef sizes, @@ -134,8 +199,8 @@ bool _compute_channels_last_contiguous_3d( T expected = 1; for (auto& d : {1, 4, 3, 2, 0}) { const auto& size_d = sizes[d]; - if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(size_d, 1))) { - if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(strides[d], expected))) { + if (size_d != 1) { + if (strides[d] != expected) { return false; } expected *= size_d; @@ -152,6 +217,59 @@ bool _compute_channels_last_contiguous_3d( } } +inline static c10::SymBool _compute_channels_last_contiguous_3d_sym( + ArrayRef sizes, + ArrayRef strides) { + switch (sizes.size()) { + case 5: { + // When this function return True, result always true. When it return + // False, result could be False or data dependent. + auto guard_or_false = [&]() { + c10::SymInt expected = 1; + for (auto& d : {1, 4, 3, 2, 0}) { + const auto& size_d = sizes[d]; + // Not taking this branch could make this return False instead of True + // but not vice-versa. so its ok. + if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) { + continue; + } + // Taking this branch could make this return False instead of True + // but not vice-versa. so its ok. + if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected))) { + return false; + } + expected *= size_d; + } + return true; + }; + + // We try to minimize creating large symbolic expressions when not needed + // to avoid symbolic evaluation perf issues. + if (guard_or_false()) { + return c10::SymBool(true); + } + + // Result is either false, or data dependent. 
+ c10::SymInt expected_stride = 1; + c10::SymBool cond = true; + + for (auto& d : {1, 4, 3, 2, 0}) { + const auto& size_d = sizes[d]; + cond = cond.sym_and( + size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride))); + expected_stride *= size_d; + } + return cond; + } + // NOLINTNEXTLINE(bugprone-branch-clone) + case 4: + // TODO dim == 4 case will be enabled once it is fully tested + return c10::SymBool(false); + default: + return c10::SymBool(false); + } +} + template bool _compute_non_overlapping_and_dense( ArrayRef sizes, diff --git a/c10/core/Layout.h b/c10/core/Layout.h index 0daa129bb5a4f..0d09e0ed46f4e 100644 --- a/c10/core/Layout.h +++ b/c10/core/Layout.h @@ -33,7 +33,6 @@ inline Layout layout_from_backend(Backend backend) { case Backend::SparseCPU: case Backend::SparseCUDA: case Backend::SparseMPS: - case Backend::SparseCsrMPS: case Backend::SparseHIP: case Backend::SparseVE: case Backend::SparseXPU: @@ -43,6 +42,7 @@ inline Layout layout_from_backend(Backend backend) { return Layout::Mkldnn; case Backend::SparseCsrCPU: case Backend::SparseCsrCUDA: + case Backend::SparseCsrMPS: case Backend::SparseCsrHIP: case Backend::SparseCsrVE: case Backend::SparseCsrXPU: diff --git a/c10/core/Scalar.h b/c10/core/Scalar.h index 3b483c86bc88f..646a1dde39940 100644 --- a/c10/core/Scalar.h +++ b/c10/core/Scalar.h @@ -191,11 +191,17 @@ class C10_API Scalar { isIntegral() const { return Tag::HAS_i == tag || Tag::HAS_si == tag || Tag::HAS_u == tag; } + bool isIntegral(bool includeBool) const { return Tag::HAS_i == tag || Tag::HAS_si == tag || Tag::HAS_u == tag || (includeBool && isBoolean()); } + // See Note [Meaning of HAS_u] + bool isUnsigned() const { + return Tag::HAS_u == tag || (Tag::HAS_i == tag && v.i >= 0); + } + bool isComplex() const { return Tag::HAS_z == tag; } diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h index 3d8a2b0074e9e..4a15eb23ac63c 100644 --- a/c10/core/ScalarType.h +++ b/c10/core/ScalarType.h @@ -19,25 +19,16 @@ #include #include -#include #include #include #include #include -namespace c10 { - -// dummy struct for uint1 to uint7, actual functionality -// of these dtypes will be implemented in python with Tensor subclass -template -struct dummy_uint1_7_t {}; +#include -// dummy struct for int1 to int7, actual functionality -// of these dtypes will be implemented in python with Tensor subclass -template -struct dummy_int1_7_t {}; +namespace c10 { -// For the macros below: +// [dtype Macros note] For the macros below: // // For users: If you want to macro some code for all non-QInt scalar types // (i.e. types with complete information, you probably want one of the @@ -57,56 +48,6 @@ struct dummy_int1_7_t {}; // some old PRs where we added new dtypes (check history of this file) can // help give you an idea where to start. -// NB: Order matters for this macro; it is relied upon in -// _promoteTypesLookup and the serialization format. 
-#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(_) \ - _(uint8_t, Byte) /* 0 */ \ - _(int8_t, Char) /* 1 */ \ - _(int16_t, Short) /* 2 */ \ - _(int, Int) /* 3 */ \ - _(int64_t, Long) /* 4 */ \ - _(at::Half, Half) /* 5 */ \ - _(float, Float) /* 6 */ \ - _(double, Double) /* 7 */ \ - _(c10::complex, ComplexHalf) /* 8 */ \ - _(c10::complex, ComplexFloat) /* 9 */ \ - _(c10::complex, ComplexDouble) /* 10 */ \ - _(bool, Bool) /* 11 */ \ - _(c10::qint8, QInt8) /* 12 */ \ - _(c10::quint8, QUInt8) /* 13 */ \ - _(c10::qint32, QInt32) /* 14 */ \ - _(at::BFloat16, BFloat16) /* 15 */ \ - _(c10::quint4x2, QUInt4x2) /* 16 */ \ - _(c10::quint2x4, QUInt2x4) /* 17 */ \ - _(c10::bits1x8, Bits1x8) /* 18 */ \ - _(c10::bits2x4, Bits2x4) /* 19 */ \ - _(c10::bits4x2, Bits4x2) /* 20 */ \ - _(c10::bits8, Bits8) /* 21 */ \ - _(c10::bits16, Bits16) /* 22 */ \ - _(c10::Float8_e5m2, Float8_e5m2) /* 23 */ \ - _(c10::Float8_e4m3fn, Float8_e4m3fn) /* 24 */ \ - _(c10::Float8_e5m2fnuz, Float8_e5m2fnuz) /* 25 */ \ - _(c10::Float8_e4m3fnuz, Float8_e4m3fnuz) /* 26 */ \ - _(uint16_t, UInt16) /* 27 */ \ - _(uint32_t, UInt32) /* 28 */ \ - _(uint64_t, UInt64) /* 29 */ \ - _(c10::dummy_uint1_7_t<1>, UInt1) /* 30 */ \ - _(c10::dummy_uint1_7_t<2>, UInt2) /* 31 */ \ - _(c10::dummy_uint1_7_t<3>, UInt3) /* 32 */ \ - _(c10::dummy_uint1_7_t<4>, UInt4) /* 33 */ \ - _(c10::dummy_uint1_7_t<5>, UInt5) /* 34 */ \ - _(c10::dummy_uint1_7_t<6>, UInt6) /* 35 */ \ - _(c10::dummy_uint1_7_t<7>, UInt7) /* 36 */ \ - _(c10::dummy_int1_7_t<1>, Int1) /* 37 */ \ - _(c10::dummy_int1_7_t<2>, Int2) /* 38 */ \ - _(c10::dummy_int1_7_t<3>, Int3) /* 39 */ \ - _(c10::dummy_int1_7_t<4>, Int4) /* 40 */ \ - _(c10::dummy_int1_7_t<5>, Int5) /* 41 */ \ - _(c10::dummy_int1_7_t<6>, Int6) /* 42 */ \ - _(c10::dummy_int1_7_t<7>, Int7) /* 43 */ \ - _(c10::Float8_e8m0fnu, Float8_e8m0fnu) /* 44 */ \ - _(c10::Float4_e2m1fn_x2, Float4_e2m1fn_x2) /* 45 */ - // If you want to support ComplexHalf for real, add ComplexHalf // into this macro (and change the name). But beware: convert() // doesn't work for all the conversions you need... @@ -152,17 +93,6 @@ struct dummy_int1_7_t {}; _(at::Float8_e4m3fnuz, Float8_e4m3fnuz) \ _(at::Float8_e8m0fnu, Float8_e8m0fnu) -enum class ScalarType : int8_t { -#define DEFINE_ST_ENUM_VAL_(_1, n) n, - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_ST_ENUM_VAL_) -#undef DEFINE_ENUM_ST_ENUM_VAL_ - Undefined, - NumOptions -}; - -constexpr uint16_t NumScalarTypes = - static_cast(ScalarType::NumOptions); - namespace impl { // These are used to map ScalarTypes to C++ types. 
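# The channels-last checks in c10/core/Contiguity.h above walk the dims in the order
# {1, 3, 2, 0}, skip size-1 dims, and compare each stride against the running product
# of the sizes seen so far. A plain-Python rendering of that walk for concrete (hinted)
# sizes and strides; the NCHW tensor below is an illustrative assumption.
import torch


def is_channels_last_contiguous_2d(sizes, strides) -> bool:
    if len(sizes) != 4:
        return False
    expected = 1
    for d in (1, 3, 2, 0):
        if sizes[d] == 1:
            continue  # size-1 dims impose no stride constraint
        if strides[d] != expected:
            return False
        expected *= sizes[d]
    return True


t = torch.empty(2, 3, 4, 5).to(memory_format=torch.channels_last)
assert is_channels_last_contiguous_2d(t.size(), t.stride()) == t.is_contiguous(
    memory_format=torch.channels_last
)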
diff --git a/c10/core/SymInt.cpp b/c10/core/SymInt.cpp index c6c2743d8358a..b78ca94dc5145 100644 --- a/c10/core/SymInt.cpp +++ b/c10/core/SymInt.cpp @@ -20,6 +20,14 @@ void SymInt::promote_to_negative() { s.data_ = 0; } +std::optional SymInt::maybe_as_int_slow_path() const { + auto* node = toSymNodeImplUnowned(); + if (auto c = node->constant_int()) { + return c; + } + return node->maybe_as_int(); +} + SymNode SymInt::toSymNode() const { TORCH_CHECK_ALWAYS_SHOW_CPP_STACKTRACE( is_heap_allocated(), "SymInt::toSymNode is_heap_allocated"); @@ -45,12 +53,11 @@ bool SymInt::has_hint() const { #define DEFINE_BINARY(API, OP, METHOD, RET) \ RET SymInt::API(const SymInt& sci) const { \ if (auto ma = maybe_as_int()) { \ - if (auto mb = sci.maybe_as_int()) { \ - return RET(OP(*ma, *mb)); \ - } else { \ - auto b = sci.toSymNode(); \ - return RET(b->wrap_int(*ma)->METHOD(b)); \ - } \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( \ + !sci.maybe_as_int(), \ + "should have hit fast path in the header in this case."); \ + auto b = sci.toSymNode(); \ + return RET(b->wrap_int(*ma)->METHOD(b)); \ } else { \ if (auto mb = sci.maybe_as_int()) { \ auto a = toSymNodeImplUnowned(); \ @@ -61,19 +68,19 @@ bool SymInt::has_hint() const { } \ } -DEFINE_BINARY(operator+, std::plus<>(), add, SymInt) -DEFINE_BINARY(operator-, std::minus<>(), sub, SymInt) -DEFINE_BINARY(operator*, std::multiplies<>(), mul, SymInt) -DEFINE_BINARY(operator/, std::divides<>(), floordiv, SymInt) -DEFINE_BINARY(operator%, std::modulus<>(), mod, SymInt) -DEFINE_BINARY(sym_eq, std::equal_to<>(), eq, SymBool) -DEFINE_BINARY(sym_ne, std::not_equal_to<>(), ne, SymBool) -DEFINE_BINARY(sym_lt, std::less<>(), lt, SymBool) -DEFINE_BINARY(sym_le, std::less_equal<>(), le, SymBool) -DEFINE_BINARY(sym_gt, std::greater<>(), gt, SymBool) -DEFINE_BINARY(sym_ge, std::greater_equal<>(), ge, SymBool) -DEFINE_BINARY(min, std::min, sym_min, SymInt) -DEFINE_BINARY(max, std::max, sym_max, SymInt) +DEFINE_BINARY(operator_add_slow_path, std::plus<>(), add, SymInt) +DEFINE_BINARY(operator_sub_slow_path, std::minus<>(), sub, SymInt) +DEFINE_BINARY(operator_mul_slow_path, std::multiplies<>(), mul, SymInt) +DEFINE_BINARY(operator_div_slow_path, std::divides<>(), floordiv, SymInt) +DEFINE_BINARY(operator_mod_slow_path, std::modulus<>(), mod, SymInt) +DEFINE_BINARY(sym_eq_slow_path, std::equal_to<>(), eq, SymBool) +DEFINE_BINARY(sym_ne_slow_path, std::not_equal_to<>(), ne, SymBool) +DEFINE_BINARY(sym_lt_slow_path, std::less<>(), lt, SymBool) +DEFINE_BINARY(sym_le_slow_path, std::less_equal<>(), le, SymBool) +DEFINE_BINARY(sym_gt_slow_path, std::greater<>(), gt, SymBool) +DEFINE_BINARY(sym_ge_slow_path, std::greater_equal<>(), ge, SymBool) +DEFINE_BINARY(min_slow_path, std::min, sym_min, SymInt) +DEFINE_BINARY(max_slow_path, std::max, sym_max, SymInt) SymInt::operator SymFloat() const { if (auto ma = maybe_as_int()) { @@ -153,15 +160,15 @@ SymInt operator-(const SymInt& s) { } } -void SymInt::operator*=(const SymInt& sci) { +void SymInt::operator_imul_slow_path(const SymInt& sci) { *this = *this * sci; } -void SymInt::operator/=(const SymInt& sci) { +void SymInt::operator_idiv_slow_path(const SymInt& sci) { *this = *this / sci; } -void SymInt::operator+=(const SymInt& sci) { +void SymInt::operator_iadd_slow_path(const SymInt& sci) { *this = *this + sci; } diff --git a/c10/core/SymInt.h b/c10/core/SymInt.h index 51686f8b81afb..9b1c776cbe2ab 100644 --- a/c10/core/SymInt.h +++ b/c10/core/SymInt.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -177,23 +178,136 
@@ class C10_API SymInt { #endif } - SymInt operator+(const SymInt& sci) const; - SymInt operator-(const SymInt& sci) const; - SymInt operator*(const SymInt& sci) const; - SymInt operator/(const SymInt& sci) const; - SymInt operator%(const SymInt& sci) const; - void operator*=(const SymInt& sci); - void operator+=(const SymInt& sci); - void operator/=(const SymInt& sci); + SymInt operator+(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma + *mb); + } + } + return operator_add_slow_path(sci); + } + + SymInt operator-(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma - *mb); + } + } + return operator_sub_slow_path(sci); + } + + SymInt operator*(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma * *mb); + } + } + return operator_mul_slow_path(sci); + } + + SymInt operator/(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma / *mb); + } + } + return operator_div_slow_path(sci); + } + + SymInt operator%(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma % *mb); + } + } + return operator_mod_slow_path(sci); + } + + void operator*=(const SymInt& sci) { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + *this = SymInt(*ma * *mb); + return; + } + } + operator_imul_slow_path(sci); + } + + void operator+=(const SymInt& sci) { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + *this = SymInt(*ma + *mb); + return; + } + } + operator_iadd_slow_path(sci); + } + + void operator/=(const SymInt& sci) { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + *this = SymInt(*ma / *mb); + return; + } + } + operator_idiv_slow_path(sci); + } SymInt clone() const; - SymBool sym_eq(const SymInt&) const; - SymBool sym_ne(const SymInt&) const; - SymBool sym_lt(const SymInt&) const; - SymBool sym_le(const SymInt&) const; - SymBool sym_gt(const SymInt&) const; - SymBool sym_ge(const SymInt&) const; + SymBool sym_eq(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma == *mb); + } + } + return sym_eq_slow_path(sci); + } + + SymBool sym_ne(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma != *mb); + } + } + return sym_ne_slow_path(sci); + } + + SymBool sym_lt(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma < *mb); + } + } + return sym_lt_slow_path(sci); + } + + SymBool sym_le(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma <= *mb); + } + } + return sym_le_slow_path(sci); + } + + SymBool sym_gt(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma > *mb); + } + } + return sym_gt_slow_path(sci); + } + + SymBool sym_ge(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma >= *mb); + } + } + return sym_ge_slow_path(sci); + } bool operator==(const SymInt& o) const { return sym_eq(o).guard_bool(__FILE__, __LINE__); @@ -214,8 +328,23 @@ class C10_API SymInt { return sym_ge(o).guard_bool(__FILE__, __LINE__); } - SymInt 
min(const SymInt& sci) const; - SymInt max(const SymInt& sci) const; + SymInt min(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(std::min(*ma, *mb)); + } + } + return min_slow_path(sci); + } + + SymInt max(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(std::max(*ma, *mb)); + } + } + return max_slow_path(sci); + } // If both are symbolic, this checks if // they share the same node. @@ -239,11 +368,7 @@ class C10_API SymInt { if (!is_heap_allocated()) { return data_; } - auto* node = toSymNodeImplUnowned(); - if (auto c = node->constant_int()) { - return c; - } - return node->maybe_as_int(); + return maybe_as_int_slow_path(); } // Return whether the integer is directly coercible to a SymInt @@ -264,6 +389,25 @@ class C10_API SymInt { private: void promote_to_negative(); + SymInt operator_add_slow_path(const SymInt& sci) const; + SymInt operator_sub_slow_path(const SymInt& sci) const; + SymInt operator_mul_slow_path(const SymInt& sci) const; + SymInt operator_div_slow_path(const SymInt& sci) const; + SymInt operator_mod_slow_path(const SymInt& sci) const; + void operator_imul_slow_path(const SymInt& sci); + void operator_iadd_slow_path(const SymInt& sci); + void operator_idiv_slow_path(const SymInt& sci); + SymBool sym_eq_slow_path(const SymInt& sci) const; + SymBool sym_ne_slow_path(const SymInt& sci) const; + SymBool sym_lt_slow_path(const SymInt& sci) const; + SymBool sym_le_slow_path(const SymInt& sci) const; + SymBool sym_gt_slow_path(const SymInt& sci) const; + SymBool sym_ge_slow_path(const SymInt& sci) const; + + SymInt min_slow_path(const SymInt& sci) const; + SymInt max_slow_path(const SymInt& sci) const; + + std::optional maybe_as_int_slow_path() const; // Constraints on the internal representation: // diff --git a/c10/core/SymbolicShapeMeta.cpp b/c10/core/SymbolicShapeMeta.cpp index 6fa2ab0ed4f1d..01276d416fbb8 100644 --- a/c10/core/SymbolicShapeMeta.cpp +++ b/c10/core/SymbolicShapeMeta.cpp @@ -71,6 +71,27 @@ normalize_sym_sizes_strides(SymIntArrayRef sizes, SymIntArrayRef strides) { return std::tuple, std::vector>( std::move(base), std::move(size_nodes), std::move(stride_nodes)); } +namespace { +bool all_hinted( + const c10::SymIntArrayRef& sizes, + const c10::SymIntArrayRef& strides) { + auto all_hinted = true; + for (const auto& s : sizes) { + if (!s.has_hint()) { + return false; + } + } + + if (all_hinted) { + for (const auto& s : strides) { + if (!s.has_hint()) { + return false; + } + } + } + return all_hinted; +} +} // namespace // Special treatment because of numel SymBool SymbolicShapeMeta::compute_contiguous() const { @@ -88,28 +109,61 @@ SymBool SymbolicShapeMeta::compute_contiguous() const { return maybe_as_bool.value(); } - auto all_hinted = true; - for (const auto& s : sizes) { - if (!s.has_hint()) { - all_hinted = false; - break; - } + if (all_hinted(sizes, strides)) { + // We avoid going through the slow path if everything is hinted, + // because evaluating a large SymPy expression can be expensive. + // TODO exclude backed_size_oblivious from this path. 
+ return _compute_contiguous(sizes_, strides_, numel()); } - if (all_hinted) { - for (const auto& s : strides) { - if (!s.has_hint()) { - all_hinted = false; - break; - } - } + return result; +} + +SymBool SymbolicShapeMeta::compute_channels_last_contiguous_2d() const { + if (!strides_valid_) { + return false; } + c10::SymIntArrayRef sizes(sizes_); + c10::SymIntArrayRef strides(strides_); - if (all_hinted) { + auto result = _compute_channels_last_contiguous_2d_sym(sizes, strides); + + // If the result is already determined without guarding, just return it. + auto maybe_as_bool = result.maybe_as_bool(); + if (maybe_as_bool.has_value()) { + return maybe_as_bool.value(); + } + + if (all_hinted(sizes, strides)) { // We avoid going through the slow path if everything is hinted, // because evaluating a large SymPy expression can be expensive. // TODO exclude backed_size_oblivious from this path. - return _compute_contiguous(sizes_, strides_, numel()); + return _compute_channels_last_contiguous_2d(sizes_, strides_); + } + + return result; +} + +SymBool SymbolicShapeMeta::compute_channels_last_contiguous_3d() const { + if (!strides_valid_) { + return false; + } + c10::SymIntArrayRef sizes(sizes_); + c10::SymIntArrayRef strides(strides_); + + auto result = _compute_channels_last_contiguous_3d_sym(sizes, strides); + + // If the result is already determined without guarding, just return it. + auto maybe_as_bool = result.maybe_as_bool(); + if (maybe_as_bool.has_value()) { + return maybe_as_bool.value(); + } + + if (all_hinted(sizes, strides)) { + // We avoid going through the slow path if everything is hinted, + // because evaluating a large SymPy expression can be expensive. + // TODO exclude backed_size_oblivious from this path. + return _compute_channels_last_contiguous_3d(sizes_, strides_); } return result; @@ -143,8 +197,6 @@ SymBool SymbolicShapeMeta::compute_contiguous() const { } // clang-format off -DEFINE_EAGER_SYMBOOL_COMPUTE(compute_channels_last_contiguous_2d, _compute_channels_last_contiguous_2d) -DEFINE_EAGER_SYMBOOL_COMPUTE(compute_channels_last_contiguous_3d, _compute_channels_last_contiguous_3d) DEFINE_EAGER_SYMBOOL_COMPUTE(compute_strides_like_channels_last_2d, is_channels_last_strides_2d) DEFINE_EAGER_SYMBOOL_COMPUTE(compute_strides_like_channels_last_3d, is_channels_last_strides_3d) diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index f3ec2f2d46ea2..cd0321d3bb6f5 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -313,8 +313,15 @@ void TensorImpl::throw_data_ptr_access_error() const { c10::SymBool TensorImpl::sym_is_contiguous_custom( at::MemoryFormat memory_format) const { if (C10_UNLIKELY(matches_python_custom(SizesStridesPolicy::CustomStrides))) { - return pyobj_slot_.load_pyobj_interpreter()->is_contiguous( - this, memory_format); + // TO reduce BC breaking and reduce having to introduce + // sym_is_contiguous. 
call is_contiguous when tensor does not + if (C10_UNLIKELY(has_symbolic_sizes_strides_)) { + return pyobj_slot_.load_pyobj_interpreter()->sym_is_contiguous( + this, memory_format); + } else { + return pyobj_slot_.load_pyobj_interpreter()->is_contiguous( + this, memory_format); + } } return sym_is_contiguous_default(memory_format); diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 381bc65b27fbd..972181327b1f6 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -643,47 +643,43 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { } } - // From https://stackoverflow.com/a/3057522/23845 - // TODO: does C++14 have a stdlib template for this? - template - struct identity { - typedef T type; - }; - template ArrayRef generic_sizes() { - return _generic_sizes(identity()); - } + static_assert( + std::is_same_v || std::is_same_v, + "Only supports int64_t and c10::SymInt."); - ArrayRef _generic_sizes(identity) { - return sizes(); - } - ArrayRef _generic_sizes(identity) { - return sym_sizes(); + if constexpr (std::is_same_v) { + return sizes(); + } else { + return sym_sizes(); + } } template ArrayRef generic_strides() { - return _generic_strides(identity()); - } + static_assert( + std::is_same_v || std::is_same_v, + "Only supports int64_t and c10::SymInt."); - ArrayRef _generic_strides(identity) { - return strides(); - } - ArrayRef _generic_strides(identity) { - return sym_strides(); + if constexpr (std::is_same_v) { + return strides(); + } else { + return sym_strides(); + } } template T generic_storage_offset() { - return _generic_storage_offset(identity()); - } + static_assert( + std::is_same_v || std::is_same_v, + "Only supports int64_t and c10::SymInt."); - int64_t _generic_storage_offset(identity) { - return storage_offset(); - } - c10::SymInt _generic_storage_offset(identity) { - return sym_storage_offset(); + if constexpr (std::is_same_v) { + return storage_offset(); + } else { + return sym_storage_offset(); + } } /** @@ -2090,6 +2086,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { constexpr auto sparse_backends = DispatchKeySet( {BackendComponent::CPUBit, BackendComponent::CUDABit, + BackendComponent::MPSBit, BackendComponent::HIPBit, BackendComponent::XPUBit}); constexpr auto sparse_k = DispatchKeySet(DispatchKey::Sparse); diff --git a/c10/core/impl/PyInterpreter.cpp b/c10/core/impl/PyInterpreter.cpp index b4ae1d612e961..913bc78726576 100644 --- a/c10/core/impl/PyInterpreter.cpp +++ b/c10/core/impl/PyInterpreter.cpp @@ -60,6 +60,10 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable { bool is_contiguous(const TensorImpl* self, at::MemoryFormat) const override { PANIC(is_contiguous); } + c10::SymBool sym_is_contiguous(const TensorImpl* self, at::MemoryFormat) + const override { + PANIC(sym_is_contiguous); + } bool is_strides_like(const TensorImpl* self, at::MemoryFormat) const override { PANIC(is_strides_like); diff --git a/c10/core/impl/PyInterpreter.h b/c10/core/impl/PyInterpreter.h index 09d4801f7d83d..def708c24b802 100644 --- a/c10/core/impl/PyInterpreter.h +++ b/c10/core/impl/PyInterpreter.h @@ -168,6 +168,9 @@ struct C10_API PyInterpreterVTable { virtual bool is_contiguous(const TensorImpl* self, at::MemoryFormat) const = 0; + virtual c10::SymBool sym_is_contiguous( + const TensorImpl* self, + at::MemoryFormat) const = 0; virtual bool is_strides_like(const TensorImpl* self, at::MemoryFormat) const = 0; virtual bool is_non_overlapping_and_dense(const TensorImpl* self) const = 0; diff --git 
a/c10/cuda/CUDAAllocatorConfig.cpp b/c10/cuda/CUDAAllocatorConfig.cpp index 3ad84fd345ca5..8706f7362a3d2 100644 --- a/c10/cuda/CUDAAllocatorConfig.cpp +++ b/c10/cuda/CUDAAllocatorConfig.cpp @@ -1,119 +1,393 @@ #include +#include +#include #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) #include #endif -#include - namespace c10::cuda::CUDACachingAllocator { -size_t CUDAAllocatorConfig::parseAllocatorConfig( - const c10::CachingAllocator::ConfigTokenizer& tokenizer, +constexpr size_t kRoundUpPowerOfTwoIntervals = 16; + +CUDAAllocatorConfig::CUDAAllocatorConfig() + : m_max_split_size(std::numeric_limits::max()), + m_max_non_split_rounding_size(kLargeBuffer), + m_garbage_collection_threshold(0), + m_pinned_num_register_threads(1), + m_expandable_segments(false), +#if CUDA_VERSION >= 12030 + m_expandable_segments_handle_type( + Expandable_Segments_Handle_Type::UNSPECIFIED), +#else + m_expandable_segments_handle_type( + Expandable_Segments_Handle_Type::POSIX_FD), +#endif + m_release_lock_on_cudamalloc(false), + m_pinned_use_cuda_host_register(false), + m_graph_capture_record_stream_reuse(false), + m_pinned_use_background_threads(false) { + m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0); +} + +size_t CUDAAllocatorConfig::roundup_power2_divisions(size_t size) { + size_t log_size = (63 - llvm::countLeadingZeros(size)); + + // Our intervals start at 1MB and end at 64GB + const size_t interval_start = + 63 - llvm::countLeadingZeros(static_cast(1048576)); + const size_t interval_end = + 63 - llvm::countLeadingZeros(static_cast(68719476736)); + TORCH_CHECK( + (interval_end - interval_start == kRoundUpPowerOfTwoIntervals), + "kRoundUpPowerOfTwoIntervals mismatch"); + + int index = static_cast(log_size) - static_cast(interval_start); + + index = std::max(0, index); + index = std::min(index, static_cast(kRoundUpPowerOfTwoIntervals) - 1); + return instance().m_roundup_power2_divisions[index]; +} + +void CUDAAllocatorConfig::lexArgs( + const std::string& env, + std::vector& config) { + std::vector buf; + + for (char ch : env) { + if (ch == ',' || ch == ':' || ch == '[' || ch == ']') { + if (!buf.empty()) { + config.emplace_back(buf.begin(), buf.end()); + buf.clear(); + } + config.emplace_back(1, ch); + } else if (ch != ' ') { + buf.emplace_back(ch); + } + } + if (!buf.empty()) { + config.emplace_back(buf.begin(), buf.end()); + } +} + +void CUDAAllocatorConfig::consumeToken( + const std::vector& config, + size_t i, + const char c) { + TORCH_CHECK( + i < config.size() && config[i] == std::string(1, c), + "Error parsing CachingAllocator settings, expected ", + c, + ""); +} + +size_t CUDAAllocatorConfig::parseMaxSplitSize( + const std::vector& config, + size_t i) { + consumeToken(config, ++i, ':'); + constexpr int mb = 1024 * 1024; + if (++i < config.size()) { + size_t val1 = stoi(config[i]); + TORCH_CHECK( + val1 > kLargeBuffer / mb, + "CachingAllocator option max_split_size_mb too small, must be > ", + kLargeBuffer / mb, + ""); + val1 = std::max(val1, kLargeBuffer / mb); + val1 = std::min(val1, (std::numeric_limits::max() / mb)); + m_max_split_size = val1 * 1024 * 1024; + } else { + TORCH_CHECK(false, "Error, expecting max_split_size_mb value", ""); + } + return i; +} + +size_t CUDAAllocatorConfig::parseMaxNonSplitRoundingSize( + const std::vector& config, size_t i) { + consumeToken(config, ++i, ':'); + constexpr int mb = 1024 * 1024; + if (++i < config.size()) { + size_t val1 = stoi(config[i]); + TORCH_CHECK( + val1 > kLargeBuffer / mb, + "CachingAllocator option 
max_non_split_rounding_mb too small, must be > ", + kLargeBuffer / mb, + ""); + val1 = std::max(val1, kLargeBuffer / mb); + val1 = std::min(val1, (std::numeric_limits::max() / mb)); + m_max_non_split_rounding_size = val1 * 1024 * 1024; + } else { + TORCH_CHECK(false, "Error, expecting max_non_split_rounding_mb value", ""); + } + return i; +} + +size_t CUDAAllocatorConfig::parseGarbageCollectionThreshold( + const std::vector& config, + size_t i) { + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + double val1 = stod(config[i]); + TORCH_CHECK( + val1 > 0, "garbage_collect_threshold too small, set it 0.0~1.0", ""); + TORCH_CHECK( + val1 < 1.0, "garbage_collect_threshold too big, set it 0.0~1.0", ""); + m_garbage_collection_threshold = val1; + } else { + TORCH_CHECK( + false, "Error, expecting garbage_collection_threshold value", ""); + } + return i; +} + +size_t CUDAAllocatorConfig::parseRoundUpPower2Divisions( + const std::vector& config, + size_t i) { + consumeToken(config, ++i, ':'); + bool first_value = true; + + if (++i < config.size()) { + if (std::string_view(config[i]) == "[") { + size_t last_index = 0; + // NOLINTNEXTLINE(bugprone-inc-dec-in-conditions) + while (++i < config.size() && std::string_view(config[i]) != "]") { + const std::string& val1 = config[i]; + size_t val2 = 0; + + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + val2 = stoi(config[i]); + } else { + TORCH_CHECK( + false, "Error parsing roundup_power2_divisions value", ""); + } + TORCH_CHECK( + val2 == 0 || llvm::isPowerOf2_64(val2), + "For roundups, the divisions has to be power of 2 or 0 to disable roundup ", + ""); + + if (std::string_view(val1) == ">") { + std::fill( + std::next( + m_roundup_power2_divisions.begin(), + static_cast::difference_type>( + last_index)), + m_roundup_power2_divisions.end(), + val2); + } else { + size_t val1_long = stoul(val1); + TORCH_CHECK( + llvm::isPowerOf2_64(val1_long), + "For roundups, the intervals have to be power of 2 ", + ""); + + size_t index = 63 - llvm::countLeadingZeros(val1_long); + index = std::max((size_t)0, index); + index = std::min(index, m_roundup_power2_divisions.size() - 1); + + if (first_value) { + std::fill( + m_roundup_power2_divisions.begin(), + std::next( + m_roundup_power2_divisions.begin(), + static_cast::difference_type>( + index)), + val2); + first_value = false; + } + if (index < m_roundup_power2_divisions.size()) { + m_roundup_power2_divisions[index] = val2; + } + last_index = index; + } + + if (std::string_view(config[i + 1]) != "]") { + consumeToken(config, ++i, ','); + } + } + } else { // Keep this for backwards compatibility + size_t val1 = stoi(config[i]); + TORCH_CHECK( + llvm::isPowerOf2_64(val1), + "For roundups, the divisions has to be power of 2 ", + ""); + std::fill( + m_roundup_power2_divisions.begin(), + m_roundup_power2_divisions.end(), + val1); + } + } else { + TORCH_CHECK(false, "Error, expecting roundup_power2_divisions value", ""); + } + return i; +} + +size_t CUDAAllocatorConfig::parseAllocatorConfig( + const std::vector& config, + size_t i, + bool& used_cudaMallocAsync) { // For ease of maintenance and understanding, the CUDA and ROCm // implementations of this function are separated. This avoids having many // #ifdef's throughout. +#ifdef USE_ROCM // Ease burden on ROCm users by allowing either cuda or hip tokens. // cuda token is broken up to prevent hipify matching it. 
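Stepping back to the roundup_power2_divisions() lookup restored above: sizes are bucketed by their log2 into 16 intervals spanning 1 MiB to 64 GiB. The sketch below reproduces just that index arithmetic in isolation, using the GCC/Clang __builtin_clzll in place of llvm::countLeadingZeros; it illustrates the bucketing only and is not the c10 implementation.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Sizes are mapped to one of 16 log2 buckets covering 1 MiB .. 64 GiB,
// clamped at both ends (mirrors roundup_power2_divisions(size) above).
static size_t bucket_index(size_t size) {
  constexpr size_t kIntervals = 16;
  const size_t log_size = 63 - __builtin_clzll(size);
  const size_t start = 63 - __builtin_clzll(1048576ull);  // log2(1 MiB) = 20
  long idx = static_cast<long>(log_size) - static_cast<long>(start);
  idx = std::max(0l, idx);
  idx = std::min(idx, static_cast<long>(kIntervals) - 1);
  return static_cast<size_t>(idx);
}

int main() {
  std::printf("512 KiB -> bucket %zu\n", bucket_index(512 * 1024));       // clamped to 0
  std::printf("2 MiB   -> bucket %zu\n", bucket_index(2 * 1024 * 1024));  // 1
  std::printf("1 TiB   -> bucket %zu\n", bucket_index(1ull << 40));       // clamped to 15
}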
#define PYTORCH_TOKEN1 \ "cud" \ "aMallocAsync" #define PYTORCH_TOKEN2 "hipMallocAsync" - tokenizer.checkToken(++i, ":"); - i++; // Move to the value after the colon - TORCH_CHECK_VALUE( - ((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1) || - (tokenizer[i] == PYTORCH_TOKEN2)), - "Unknown allocator backend, " - "options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2); - if (m_is_allocator_loaded) { - bool aync_allocator_at_runtime = (tokenizer[i] != "native"); - TORCH_CHECK( - aync_allocator_at_runtime == m_use_async_allocator, - "Allocator async backend parsed at runtime != allocator async backend parsed at load time, ", - aync_allocator_at_runtime, + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + TORCH_CHECK( + ((config[i] == "native") || (config[i] == PYTORCH_TOKEN1) || + (config[i] == PYTORCH_TOKEN2)), + "Unknown allocator backend, " + "options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2); + used_cudaMallocAsync = + (config[i] == PYTORCH_TOKEN1 || config[i] == PYTORCH_TOKEN2); + TORCH_INTERNAL_ASSERT( + config[i] == get()->name() || + (config[i] == PYTORCH_TOKEN1 && get()->name() == PYTORCH_TOKEN2), + "Allocator backend parsed at runtime != " + "allocator backend parsed at load time, ", + config[i], " != ", - m_use_async_allocator); + get()->name()); + } else { + TORCH_CHECK(false, "Error parsing backend value", ""); } - m_use_async_allocator = - (tokenizer[i] == PYTORCH_TOKEN1 || tokenizer[i] == PYTORCH_TOKEN2); - // CUDA allocator is always loaded at the start of the program - m_is_allocator_loaded = true; - -#if defined(CUDA_VERSION) - if (m_use_async_allocator) { -#if CUDA_VERSION >= 11040 - int version = 0; - C10_CUDA_CHECK(cudaDriverGetVersion(&version)); + return i; +#undef PYTORCH_TOKEN1 +#undef PYTORCH_TOKEN2 +#else // USE_ROCM + consumeToken(config, ++i, ':'); + if (++i < config.size()) { TORCH_CHECK( - version >= 11040, - "backend:cudaMallocAsync requires CUDA runtime " - "11.4 or newer, but cudaDriverGetVersion returned ", - version); + ((config[i] == "native") || (config[i] == "cudaMallocAsync")), + "Unknown allocator backend, " + "options are native and cudaMallocAsync"); + used_cudaMallocAsync = (config[i] == "cudaMallocAsync"); + if (used_cudaMallocAsync) { +#if CUDA_VERSION >= 11040 + int version = 0; + C10_CUDA_CHECK(cudaDriverGetVersion(&version)); + TORCH_CHECK( + version >= 11040, + "backend:cudaMallocAsync requires CUDA runtime " + "11.4 or newer, but cudaDriverGetVersion returned ", + version); #else - TORCH_CHECK( - false, - "backend:cudaMallocAsync requires PyTorch to be built with " - "CUDA 11.4 or newer, but CUDA_VERSION is ", - CUDA_VERSION); + TORCH_CHECK( + false, + "backend:cudaMallocAsync requires PyTorch to be built with " + "CUDA 11.4 or newer, but CUDA_VERSION is ", + CUDA_VERSION); #endif + } + TORCH_INTERNAL_ASSERT( + config[i] == get()->name(), + "Allocator backend parsed at runtime != " + "allocator backend parsed at load time"); + } else { + TORCH_CHECK(false, "Error parsing backend value", ""); } -#endif - return i; -#undef PYTORCH_TOKEN1 -#undef PYTORCH_TOKEN2 +#endif // USE_ROCM } -void CUDAAllocatorConfig::parseArgs(const std::string& env) { +void CUDAAllocatorConfig::parseArgs(const std::optional& env) { // If empty, set the default values + m_max_split_size = std::numeric_limits::max(); + m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0); + m_garbage_collection_threshold = 0; + bool used_cudaMallocAsync = false; bool used_native_specific_option = false; - 
c10::CachingAllocator::ConfigTokenizer tokenizer(env); - for (size_t i = 0; i < tokenizer.size(); i++) { - const auto& key = tokenizer[i]; - if (key == "backend") { - i = parseAllocatorConfig(tokenizer, i); + if (!env.has_value()) { + return; + } + { + std::lock_guard lock(m_last_allocator_settings_mutex); + m_last_allocator_settings = env.value(); + } + + std::vector config; + lexArgs(env.value(), config); + + for (size_t i = 0; i < config.size(); i++) { + std::string_view config_item_view(config[i]); + if (config_item_view == "max_split_size_mb") { + i = parseMaxSplitSize(config, i); + used_native_specific_option = true; + } else if (config_item_view == "max_non_split_rounding_mb") { + i = parseMaxNonSplitRoundingSize(config, i); + used_native_specific_option = true; + } else if (config_item_view == "garbage_collection_threshold") { + i = parseGarbageCollectionThreshold(config, i); + used_native_specific_option = true; + } else if (config_item_view == "roundup_power2_divisions") { + i = parseRoundUpPower2Divisions(config, i); + used_native_specific_option = true; + } else if (config_item_view == "backend") { + i = parseAllocatorConfig(config, i, used_cudaMallocAsync); + } else if (config_item_view == "expandable_segments") { + used_native_specific_option = true; + consumeToken(config, ++i, ':'); + ++i; + TORCH_CHECK( + i < config.size() && + (std::string_view(config[i]) == "True" || + std::string_view(config[i]) == "False"), + "Expected a single True/False argument for expandable_segments"); + config_item_view = config[i]; + m_expandable_segments = (config_item_view == "True"); } else if ( // ROCm build's hipify step will change "cuda" to "hip", but for ease of // use, accept both. We must break up the string to prevent hipify here. - key == "release_lock_on_hipmalloc" || - key == + config_item_view == "release_lock_on_hipmalloc" || + config_item_view == "release_lock_on_c" "udamalloc") { used_native_specific_option = true; - tokenizer.checkToken(++i, ":"); - m_release_lock_on_cudamalloc = tokenizer.toBool(++i); + consumeToken(config, ++i, ':'); + ++i; + TORCH_CHECK( + i < config.size() && + (std::string_view(config[i]) == "True" || + std::string_view(config[i]) == "False"), + "Expected a single True/False argument for release_lock_on_cudamalloc"); + config_item_view = config[i]; + m_release_lock_on_cudamalloc = (config_item_view == "True"); } else if ( // ROCm build's hipify step will change "cuda" to "hip", but for ease of // use, accept both. We must break up the string to prevent hipify here. 
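For reference, the hand-rolled lexer that parseArgs() goes back to here turns the raw PYTORCH_CUDA_ALLOC_CONF string into a flat token stream: ',', ':', '[' and ']' become one-character tokens, spaces are dropped, and everything else accumulates into value tokens. A standalone sketch of that behaviour (the config string in main is only an example):

#include <iostream>
#include <string>
#include <vector>

// lexArgs()-style tokenizer: delimiters become single-character tokens,
// spaces are skipped, other characters build up value tokens.
std::vector<std::string> lex(const std::string& env) {
  std::vector<std::string> out;
  std::string buf;
  for (char ch : env) {
    if (ch == ',' || ch == ':' || ch == '[' || ch == ']') {
      if (!buf.empty()) {
        out.push_back(buf);
        buf.clear();
      }
      out.push_back(std::string(1, ch));
    } else if (ch != ' ') {
      buf.push_back(ch);
    }
  }
  if (!buf.empty()) {
    out.push_back(buf);
  }
  return out;
}

int main() {
  for (const auto& tok : lex("max_split_size_mb:256,expandable_segments:True")) {
    std::cout << '[' << tok << "] ";
  }
  std::cout << '\n';  // [max_split_size_mb] [:] [256] [,] [expandable_segments] [:] [True]
}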
- key == "pinned_use_hip_host_register" || - key == + config_item_view == "pinned_use_hip_host_register" || + config_item_view == "pinned_use_c" "uda_host_register") { - i = parsePinnedUseCudaHostRegister(tokenizer, i); + i = parsePinnedUseCudaHostRegister(config, i); used_native_specific_option = true; - } else if (key == "pinned_num_register_threads") { - i = parsePinnedNumRegisterThreads(tokenizer, i); + } else if (config_item_view == "pinned_num_register_threads") { + i = parsePinnedNumRegisterThreads(config, i); + used_native_specific_option = true; + } else if (config_item_view == "pinned_use_background_threads") { + i = parsePinnedUseBackgroundThreads(config, i); + used_native_specific_option = true; + } else if (config_item_view == "graph_capture_record_stream_reuse") { + i = parseGraphCaptureRecordStreamReuse(config, i); used_native_specific_option = true; } else { - const auto& keys = - c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys(); TORCH_CHECK( - keys.find(key) != keys.end(), - "Unrecognized key '", - key, - "' in Accelerator allocator config."); - i = tokenizer.skipKey(i); + false, "Unrecognized CachingAllocator option: ", config_item_view); } - if (i + 1 < tokenizer.size()) { - tokenizer.checkToken(++i, ","); + if (i + 1 < config.size()) { + consumeToken(config, ++i, ','); } } - if (m_use_async_allocator && used_native_specific_option) { + if (used_cudaMallocAsync && used_native_specific_option) { TORCH_WARN( "backend:cudaMallocAsync ignores max_split_size_mb," "roundup_power2_divisions, and garbage_collect_threshold."); @@ -121,33 +395,81 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) { } size_t CUDAAllocatorConfig::parsePinnedUseCudaHostRegister( - const c10::CachingAllocator::ConfigTokenizer& tokenizer, + const std::vector& config, size_t i) { - tokenizer.checkToken(++i, ":"); - m_pinned_use_cuda_host_register = tokenizer.toBool(++i); + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + TORCH_CHECK( + (config[i] == "True" || config[i] == "False"), + "Expected a single True/False argument for pinned_use_cuda_host_register"); + m_pinned_use_cuda_host_register = (config[i] == "True"); + } else { + TORCH_CHECK( + false, "Error, expecting pinned_use_cuda_host_register value", ""); + } + return i; +} + +size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse( + const std::vector& config, + size_t i) { + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + TORCH_CHECK( + (config[i] == "True" || config[i] == "False"), + "Expected a single True/False argument for graph_capture_record_stream_reuse"); + m_graph_capture_record_stream_reuse = (config[i] == "True"); + } else { + TORCH_CHECK( + false, "Error, expecting graph_capture_record_stream_reuse value", ""); + } return i; } size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads( - const c10::CachingAllocator::ConfigTokenizer& tokenizer, + const std::vector& config, size_t i) { - tokenizer.checkToken(++i, ":"); - size_t val2 = tokenizer.toSizeT(++i); - TORCH_CHECK_VALUE( - llvm::isPowerOf2_64(val2), - "Number of register threads has to be power of 2 ", - ""); - auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads(); - TORCH_CHECK_VALUE( - val2 <= maxThreads, - "Number of register threads should be less than or equal to " + - std::to_string(maxThreads), - ""); - m_pinned_num_register_threads = val2; + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + size_t val2 = stoi(config[i]); + TORCH_CHECK( + llvm::isPowerOf2_64(val2), + "Number of register 
threads has to be power of 2 ", + ""); + auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads(); + TORCH_CHECK( + val2 <= maxThreads, + "Number of register threads should be less than or equal to " + + std::to_string(maxThreads), + ""); + m_pinned_num_register_threads = val2; + } else { + TORCH_CHECK( + false, "Error, expecting pinned_num_register_threads value", ""); + } return i; } -REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK(CUDAAllocatorConfig) +size_t CUDAAllocatorConfig::parsePinnedUseBackgroundThreads( + const std::vector& config, + size_t i) { + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + TORCH_CHECK( + (config[i] == "True" || config[i] == "False"), + "Expected a single True/False argument for pinned_use_background_threads"); + m_pinned_use_background_threads = (config[i] == "True"); + } else { + TORCH_CHECK( + false, "Error, expecting pinned_use_background_threads value", ""); + } + return i; +} + +// General caching allocator utilities +void setAllocatorSettings(const std::string& env) { + CUDACachingAllocator::CUDAAllocatorConfig::instance().parseArgs(env.c_str()); +} } // namespace c10::cuda::CUDACachingAllocator diff --git a/c10/cuda/CUDAAllocatorConfig.h b/c10/cuda/CUDAAllocatorConfig.h index 21d72e4b68313..54c41ba70fb6f 100644 --- a/c10/cuda/CUDAAllocatorConfig.h +++ b/c10/cuda/CUDAAllocatorConfig.h @@ -1,12 +1,16 @@ #pragma once -#include -#include #include -#include #include #include +#include +#include +#include +#include +#include +#include + namespace c10::cuda::CUDACachingAllocator { enum class Expandable_Segments_Handle_Type : int { @@ -18,28 +22,21 @@ enum class Expandable_Segments_Handle_Type : int { // Environment config parser class C10_CUDA_API CUDAAllocatorConfig { public: - C10_DEPRECATED_MESSAGE( - "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_split_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size() instead.") static size_t max_split_size() { - return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size(); + return instance().m_max_split_size; } - C10_DEPRECATED_MESSAGE( - "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::garbage_collection_threshold() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::garbage_collection_threshold() instead.") static double garbage_collection_threshold() { - return c10::CachingAllocator::AcceleratorAllocatorConfig:: - garbage_collection_threshold(); + return instance().m_garbage_collection_threshold; } static bool expandable_segments() { - bool enabled = c10::CachingAllocator::AcceleratorAllocatorConfig:: - use_expandable_segments(); #ifndef PYTORCH_C10_DRIVER_API_SUPPORTED - if (enabled) { + if (instance().m_expandable_segments) { TORCH_WARN_ONCE("expandable_segments not supported on this platform") } return false; #else - return enabled; + return instance().m_expandable_segments; #endif } @@ -56,6 +53,10 @@ class C10_CUDA_API CUDAAllocatorConfig { return instance().m_release_lock_on_cudamalloc; } + static bool graph_capture_record_stream_reuse() { + return instance().m_graph_capture_record_stream_reuse; + } + /** Pinned memory allocator settings */ static bool pinned_use_cuda_host_register() { return instance().m_pinned_use_cuda_host_register; @@ -65,11 +66,8 @@ class C10_CUDA_API CUDAAllocatorConfig { return instance().m_pinned_num_register_threads; } - C10_DEPRECATED_MESSAGE( - "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_use_background_threads() is deprecated. 
Please use c10::CachingAllocator::AcceleratorAllocatorConfig::pinned_use_background_threads() instead.") static bool pinned_use_background_threads() { - return c10::CachingAllocator::AcceleratorAllocatorConfig:: - pinned_use_background_threads(); + return instance().m_pinned_use_background_threads; } static size_t pinned_max_register_threads() { @@ -79,107 +77,96 @@ class C10_CUDA_API CUDAAllocatorConfig { return 128; } - C10_DEPRECATED_MESSAGE( - "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.") - static size_t roundup_power2_divisions(size_t size) { - return c10::CachingAllocator::AcceleratorAllocatorConfig:: - roundup_power2_divisions(size); - } + // This is used to round-up allocation size to nearest power of 2 divisions. + // More description below in function roundup_power2_next_division + // As an example, if we want 4 divisions between 2's power, this can be done + // using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4 + static size_t roundup_power2_divisions(size_t size); - C10_DEPRECATED_MESSAGE( - "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.") static std::vector roundup_power2_divisions() { - return c10::CachingAllocator::AcceleratorAllocatorConfig:: - roundup_power2_divisions(); + return instance().m_roundup_power2_divisions; } - C10_DEPRECATED_MESSAGE( - "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_non_split_rounding_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_non_split_rounding_size() instead.") static size_t max_non_split_rounding_size() { - return c10::CachingAllocator::AcceleratorAllocatorConfig:: - max_non_split_rounding_size(); + return instance().m_max_non_split_rounding_size; } - C10_DEPRECATED_MESSAGE( - "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::last_allocator_settings() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::last_allocator_settings() instead.") static std::string last_allocator_settings() { - return c10::CachingAllocator::getAllocatorSettings(); - } - - static bool use_async_allocator() { - return instance().m_use_async_allocator; - } - - // Use `Construct On First Use Idiom` to avoid `Static Initialization Order` - // issue. - static const std::unordered_set& getKeys() { - static std::unordered_set keys{ - "backend", - // keep BC for Rocm: `cuda` -> `cud` `a`, to avoid hipify issues - // NOLINTBEGIN(bugprone-suspicious-missing-comma,-warnings-as-errors) - "release_lock_on_cud" - "amalloc", - "pinned_use_cud" - "a_host_register", - // NOLINTEND(bugprone-suspicious-missing-comma,-warnings-as-errors) - "release_lock_on_hipmalloc", - "pinned_use_hip_host_register", - "pinned_num_register_threads"}; - return keys; + std::lock_guard lock( + instance().m_last_allocator_settings_mutex); + return instance().m_last_allocator_settings; } static CUDAAllocatorConfig& instance() { static CUDAAllocatorConfig* s_instance = ([]() { auto inst = new CUDAAllocatorConfig(); - auto env = c10::utils::get_env("PYTORCH_ALLOC_CONF"); - if (!env.has_value()) { - // For backward compatibility, check for the old environment variable - // PYTORCH_CUDA_ALLOC_CONF. 
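Usage-wise, everything this header exposes is read from PYTORCH_CUDA_ALLOC_CONF the first time CUDAAllocatorConfig::instance() is constructed, so the variable must be set before anything touches the CUDA caching allocator. A small example, assuming a POSIX environment for setenv; the option names come from the parser above and the values are only illustrative:

#include <stdlib.h>  // setenv (POSIX)

int main() {
  // Must happen before the first allocator use, otherwise the singleton has
  // already parsed (or ignored) the environment.
  setenv("PYTORCH_CUDA_ALLOC_CONF",
         "max_split_size_mb:256,"
         "roundup_power2_divisions:[256:1,512:2,1024:4,>:8],"
         "garbage_collection_threshold:0.8,"
         "pinned_use_background_threads:True",
         /*overwrite=*/1);
  // ... initialize CUDA / run the workload here ...
  return 0;
}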
- env = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF"); - } + auto env = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF"); #ifdef USE_ROCM // convenience for ROCm users, allow alternative HIP token if (!env.has_value()) { env = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF"); } #endif - if (env.has_value()) { - inst->parseArgs(env.value()); - } + inst->parseArgs(env); return inst; })(); return *s_instance; } - void parseArgs(const std::string& env); + void parseArgs(const std::optional& env); private: - CUDAAllocatorConfig() = default; - - size_t parseAllocatorConfig( - const c10::CachingAllocator::ConfigTokenizer& tokenizer, + CUDAAllocatorConfig(); + + static void lexArgs(const std::string& env, std::vector& config); + static void consumeToken( + const std::vector& config, + size_t i, + const char c); + size_t parseMaxSplitSize(const std::vector& config, size_t i); + size_t parseMaxNonSplitRoundingSize( + const std::vector& config, + size_t i); + size_t parseGarbageCollectionThreshold( + const std::vector& config, + size_t i); + size_t parseRoundUpPower2Divisions( + const std::vector& config, size_t i); + size_t parseAllocatorConfig( + const std::vector& config, + size_t i, + bool& used_cudaMallocAsync); size_t parsePinnedUseCudaHostRegister( - const c10::CachingAllocator::ConfigTokenizer& tokenizer, + const std::vector& config, size_t i); size_t parsePinnedNumRegisterThreads( - const c10::CachingAllocator::ConfigTokenizer& tokenizer, + const std::vector& config, + size_t i); + size_t parsePinnedUseBackgroundThreads( + const std::vector& config, + size_t i); + size_t parseGraphCaptureRecordStreamReuse( + const std::vector& config, size_t i); - std::atomic m_pinned_num_register_threads{1}; - std::atomic m_expandable_segments_handle_type -#if CUDA_VERSION >= 12030 - {Expandable_Segments_Handle_Type::UNSPECIFIED}; -#else - {Expandable_Segments_Handle_Type::POSIX_FD}; -#endif - std::atomic m_release_lock_on_cudamalloc{false}; - std::atomic m_pinned_use_cuda_host_register{false}; - std::atomic m_use_async_allocator{false}; - std::atomic m_is_allocator_loaded{false}; + std::atomic m_max_split_size; + std::atomic m_max_non_split_rounding_size; + std::vector m_roundup_power2_divisions; + std::atomic m_garbage_collection_threshold; + std::atomic m_pinned_num_register_threads; + std::atomic m_expandable_segments; + std::atomic + m_expandable_segments_handle_type; + std::atomic m_release_lock_on_cudamalloc; + std::atomic m_pinned_use_cuda_host_register; + std::atomic m_graph_capture_record_stream_reuse; + std::atomic m_pinned_use_background_threads; + std::string m_last_allocator_settings; + std::mutex m_last_allocator_settings_mutex; }; -// Keep this for backwards compatibility -using c10::CachingAllocator::setAllocatorSettings; +// General caching allocator utilities +C10_CUDA_API void setAllocatorSettings(const std::string& env); } // namespace c10::cuda::CUDACachingAllocator diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index c2a46ac9f3f74..93ac4f7a4c649 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -63,6 +64,10 @@ namespace cuda::CUDACachingAllocator { using namespace c10::CachingAllocator; using namespace c10::CachingDeviceAllocator; +// Included here as this is externally used in CUDAAllocatorConfig +const size_t kLargeBuffer = + 20971520; // "large" allocations may be packed in 20 MiB blocks + namespace Native { // @@ -1162,8 +1167,13 @@ class 
DeviceCachingAllocator { // tracks which pools we can use as a last resort before ooming ska::flat_hash_set use_on_oom_pools; - // See free() for this thing's purpose - std::vector needs_events_deferred_until_no_capture; + // Map of blocks whose freeing is deferred until after CUDA graph capture. + // - Key: Block* to be freed. + // - Value: List of "empty nodes" inserted as free markers during capture. + // If the vector is empty, the block must always be deferred until capture + // ends. + ska::flat_hash_map> deferred_blocks; + // outstanding cuda events ska::flat_hash_map< cuda::CUDAStream, @@ -1218,7 +1228,7 @@ class DeviceCachingAllocator { DeviceCachingAllocator() : large_blocks(/*small=*/false), small_blocks(/*small=*/true) { stats.max_split_size = - static_cast(AcceleratorAllocatorConfig::max_split_size()); + static_cast(CUDAAllocatorConfig::max_split_size()); context_recorder_.store(nullptr); } @@ -1324,6 +1334,11 @@ class DeviceCachingAllocator { // capture. Cross-stream memory use is uncommon, so the deferral's // effect on memory use during capture should be small. process_events(context); + } else { + if (CUDAAllocatorConfig::graph_capture_record_stream_reuse()) { + // We check if there is some block that is safe to reuse on this stream + free_safe_blocks_in_capture(context, stream); + } } size_t size = round_size(orig_size); auto& pool = get_pool(size, stream); @@ -1343,8 +1358,7 @@ class DeviceCachingAllocator { // Do garbage collection if the flag is set. if (C10_UNLIKELY( set_fraction && - AcceleratorAllocatorConfig::garbage_collection_threshold() > - 0.0)) { + CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) { garbage_collect_cached_blocks(context); } // Attempt allocate @@ -1596,7 +1610,7 @@ class DeviceCachingAllocator { stats.active_bytes[stat_type].increase(block->size); stats.requested_bytes[stat_type].increase(block->requested_size); }); - if (block->size >= AcceleratorAllocatorConfig::max_split_size()) + if (block->size >= CUDAAllocatorConfig::max_split_size()) stats.oversize_allocations.increase(1); auto allocated_bytes_gauge = @@ -1615,6 +1629,248 @@ class DeviceCachingAllocator { return block; } + // Insert "free marker" (empty nodes) into the CUDA graph for all streams that + // have used the block, including the allocation stream. These nodes mark the + // last use of the block in the capture graph. Returns a vector of the + // inserted nodes, or an empty vector if any stream is not capturing. 
+ std::vector insert_free_marker(Block* block) { + std::vector empty_nodes; + + auto try_add_empty_node = [&](cudaStream_t stream) -> bool { + cudaStreamCaptureStatus status{}; + cudaGraph_t graph{}; + const cudaGraphNode_t* deps = nullptr; + size_t num_deps = 0; +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000) + C10_CUDA_CHECK(cudaStreamGetCaptureInfo( + stream, &status, nullptr, &graph, &deps, nullptr, &num_deps)); +#else + C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2( + stream, &status, nullptr, &graph, &deps, &num_deps)); +#endif + + TORCH_INTERNAL_ASSERT( + status != cudaStreamCaptureStatusInvalidated, + "Invalid stream capture status"); + + if (status == cudaStreamCaptureStatusNone) { + return false; + } + + cudaGraphNode_t node{}; + C10_CUDA_CHECK(cudaGraphAddEmptyNode(&node, graph, deps, num_deps)); +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000) + C10_CUDA_CHECK(cudaStreamUpdateCaptureDependencies( + stream, &node, nullptr, 1, cudaStreamSetCaptureDependencies)); +#else + C10_CUDA_CHECK(cudaStreamUpdateCaptureDependencies( + stream, &node, 1, cudaStreamSetCaptureDependencies)); +#endif + empty_nodes.push_back(node); + return true; + }; + + // If any stream is not currently capturing, return an empty node vector. + // An empty vector indicates that the block should be deferred for freeing + // until after capture. + + // Attempt to add an empty node for the allocation stream. + if (!try_add_empty_node(block->stream)) { + return {}; + } + // Attempt to add empty nodes for all streams that have used the block. + for (const auto& s : block->stream_uses) { + if (!try_add_empty_node(s.stream())) { + return {}; + } + } + return empty_nodes; + } + + // Returns the current set of "terminal" nodes in the CUDA graph for a given + // stream. These represent the current endpoints of the stream, and may + // include additional nodes if the graph branches. Any new work captured will + // be attached after one or more of these terminals. + std::vector get_terminals(cudaStream_t stream) { + std::vector result; + + cudaStreamCaptureStatus status{}; + cudaGraph_t graph{}; + const cudaGraphNode_t* dependencies = nullptr; + size_t num_dependencies = 0; + +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000) + C10_CUDA_CHECK(cudaStreamGetCaptureInfo( + stream, + &status, + nullptr, + &graph, + &dependencies, + nullptr, + &num_dependencies)); +#else + C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2( + stream, &status, nullptr, &graph, &dependencies, &num_dependencies)); +#endif + + TORCH_INTERNAL_ASSERT( + status == cudaStreamCaptureStatusActive, + "Invalid stream capture status"); + + for (size_t i = 0; i < num_dependencies; i++) { + auto node = dependencies[i]; + if (node != nullptr) { + result.push_back(node); + } + } + + return result; + } + + // Returns the set of "reusable" free markers (empty nodes) in the current + // CUDA graph capture. A free marker is considered reusable if it is a + // predecessor of every terminal node. + // This ensures that all future captured work will occur after the free + // marker, making it safe to reuse. + ska::flat_hash_set get_reusable_empty_nodes( + cudaStream_t stream) { + auto terminals = get_terminals(stream); + if (terminals.empty()) { + // No terminal nodes found; nothing to free. 
+ return {}; + } + + auto get_dependencies = [](cudaGraphNode_t node, + cudaGraphNode_t* pDependencies, + size_t* pNumDependencies) -> void { +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000) + C10_CUDA_CHECK(cudaGraphNodeGetDependencies( + node, pDependencies, nullptr, pNumDependencies)); +#else + C10_CUDA_CHECK( + cudaGraphNodeGetDependencies(node, pDependencies, pNumDependencies)); +#endif + }; + + // Helper to retrieve all parent nodes (dependencies) of a given node. + auto get_parents = + [&](cudaGraphNode_t node) -> std::vector { + size_t count = 0; + get_dependencies(node, nullptr, &count); + std::vector out(count); + if (count) { + get_dependencies(node, out.data(), &count); + out.resize(count); + } + return out; + }; + + // Helper to determine if a node is an empty node (used as a free marker). + auto is_empty_node = [](cudaGraphNode_t n) -> bool { + cudaGraphNodeType type{}; + C10_CUDA_CHECK(cudaGraphNodeGetType(n, &type)); + return type == cudaGraphNodeTypeEmpty; + }; + + // For each terminal node, perform a reverse DFS to count, for each empty + // node, how many terminals it can reach (i.e., for how many terminals it is + // a predecessor). An empty node is reusable if it is a predecessor of all + // terminal nodes. + ska::flat_hash_map num_terminals_reachable; + + for (auto terminal : terminals) { + ska::flat_hash_set visited; + ska::flat_hash_set empty_nodes; + + std::function reverse_dfs = + [&](cudaGraphNode_t node) { + if (!visited.insert(node).second) + return; + + if (is_empty_node(node)) { + num_terminals_reachable[node]++; + empty_nodes.insert(node); + } + auto parents = get_parents(node); + for (auto p : parents) { + reverse_dfs(p); + } + }; + + reverse_dfs(terminal); + } + + ska::flat_hash_set reusable_empty_nodes; + for (auto [node, count] : num_terminals_reachable) { + if (count == terminals.size()) { + reusable_empty_nodes.insert(node); + } + } + + return reusable_empty_nodes; + } + + // A block is considered reusable during CUDA graph capture if every free + // marker (empty node) associated with the block is a predecessor of every + // terminal node. + // + // This ensures that any new operation added to the graph will be attached + // after all terminal nodes, which themselves are after all free markers. As a + // result, all future work is guaranteed to occur after the block's last use + // on every stream, so the block's previous lifetime ends before any new + // lifetime begins. This check relies solely on the DAG topology and does not + // require event queries, making it safe to use during capture. + // + // This function iterates over all deferred blocks, determines if their empty + // nodes are reusable according to the above criteria, and frees the block if + // so. + void free_safe_blocks_in_capture( + const std::shared_ptr& context, + cudaStream_t stream) { + auto reusable_empty_nodes = get_reusable_empty_nodes(stream); + + // If there are no reusable empty nodes (e.g., not currently capturing), + // there is nothing to do. + if (reusable_empty_nodes.empty()) { + return; + } + + std::vector blocks_to_erase; + + for (auto& [block, inserted_empty_nodes] : deferred_blocks) { + // Skip this block if it has no empty nodes, as we defer its freeing until + // after graph capture. Also skip if the block was not allocated on the + // current stream; such blocks will be freed when + // free_safe_blocks_in_capture is attempted on that stream. 
+ if (inserted_empty_nodes.empty() || block->stream != stream) { + continue; + } + + bool is_reusable = true; + + for (const auto& node : inserted_empty_nodes) { + if (reusable_empty_nodes.find(node) == reusable_empty_nodes.end()) { + is_reusable = false; + break; + } + } + + if (is_reusable) { + // Clear stream uses since the graph ensures proper synchronization. + // No need to insert events. + block->stream_uses.clear(); + + free_block(block, context); + blocks_to_erase.push_back(block); + } + } + + // Remove blocks that were freed from the deferred_blocks map. + for (auto* block : blocks_to_erase) { + deferred_blocks.erase(block); + } + } + void free(Block* block) { std::shared_ptr context = maybeGatherContext(RecordContext::ALL); @@ -1647,17 +1903,25 @@ class DeviceCachingAllocator { block->pool->owner_MempoolId(), context ? context : block->context_when_allocated); - if (block->size >= AcceleratorAllocatorConfig::max_split_size()) + if (block->size >= CUDAAllocatorConfig::max_split_size()) stats.oversize_allocations.decrease(1); + // If the block has been used on more than one stream, handle accordingly. if (!block->stream_uses.empty()) { if (C10_UNLIKELY(!captures_underway.empty())) { - // It's forbidden to cudaEventQuery an event recorded during CUDA graph - // capture. We conservatively defer recording end-of-life events until - // the next call to process_events() (which won't happen until no - // captures are underway) - needs_events_deferred_until_no_capture.push_back(block); + if (CUDAAllocatorConfig::graph_capture_record_stream_reuse()) { + // insert_free_marker returns a vector of free markers, + // or an empty vector if any associated stream is not currently + // capturing. The empty vector means that we will defer the free until + // capture is finished. + deferred_blocks.emplace(block, insert_free_marker(block)); + } else { + // If graph_capture_record_stream_reuse is not enabled, always defer + // the free until capture is finished. + deferred_blocks.emplace(block, std::vector{}); + } } else { + // If not in a capture, insert events for the block. insert_events(block); } } else { @@ -2196,8 +2460,7 @@ class DeviceCachingAllocator { if (size < kMinBlockSize) { return kMinBlockSize; } else { - auto divisions = - AcceleratorAllocatorConfig::roundup_power2_divisions(size); + auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size); if (divisions > 1 && size > (kMinBlockSize * divisions)) { return roundup_power2_next_division(size, divisions); } else { @@ -2676,7 +2939,7 @@ class DeviceCachingAllocator { if (block->pool->is_small || CUDAAllocatorConfig::expandable_segments()) { return remaining >= kMinBlockSize; } else { - return (size < AcceleratorAllocatorConfig::max_split_size()) && + return (size < CUDAAllocatorConfig::max_split_size()) && (remaining > kSmallSize); } } @@ -2696,7 +2959,7 @@ class DeviceCachingAllocator { if (C10_UNLIKELY( set_fraction && - AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) { + CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) { // Track block reuse interval only when garbage collection is enabled. 
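The reuse test added above (get_reusable_empty_nodes() plus free_safe_blocks_in_capture()) boils down to a pure graph property: a free marker may be honoured during capture only if it is a predecessor of every current terminal node, so that anything captured later is ordered after the block's last use. Below is a CUDA-free sketch of that check on a toy dependency DAG; node ids and edges are made up for illustration.

#include <cstdio>
#include <unordered_map>
#include <unordered_set>
#include <vector>

// Edges point from a node to its dependencies (parents), matching how the
// capture graph is walked backwards in the allocator code above.
using Graph = std::unordered_map<int, std::vector<int>>;

// True if `marker` is a predecessor of every terminal, i.e. all future work
// appended after the terminals is ordered after the marker.
bool marker_reusable(const Graph& parents,
                     const std::vector<int>& terminals,
                     int marker) {
  for (int t : terminals) {
    std::unordered_set<int> visited;
    std::vector<int> stack{t};
    bool found = false;
    while (!stack.empty()) {
      int n = stack.back();
      stack.pop_back();
      if (!visited.insert(n).second) continue;
      if (n == marker) { found = true; break; }
      auto it = parents.find(n);
      if (it != parents.end())
        for (int p : it->second) stack.push_back(p);
    }
    if (!found) return false;  // some terminal does not depend on the marker
  }
  return true;
}

int main() {
  // Marker 1 feeds both terminals (4 and 5); marker 2 only feeds terminal 5.
  Graph parents = {{3, {1}}, {4, {3}}, {5, {3, 2}}};
  std::vector<int> terminals = {4, 5};
  std::printf("marker 1 reusable: %d\n", marker_reusable(parents, terminals, 1));  // 1
  std::printf("marker 2 reusable: %d\n", marker_reusable(parents, terminals, 2));  // 0
}

In the allocator itself this path only runs when the new graph_capture_record_stream_reuse option is enabled (parsed earlier in this patch, e.g. graph_capture_record_stream_reuse:True in PYTORCH_CUDA_ALLOC_CONF); otherwise cross-stream blocks are still deferred until capture ends, as before.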
++pool.get_free_blocks_call_count; } @@ -2738,13 +3001,13 @@ class DeviceCachingAllocator { } // Do not return an oversized block for a large request - if ((p.size() < AcceleratorAllocatorConfig::max_split_size()) && - ((*it)->size >= AcceleratorAllocatorConfig::max_split_size())) + if ((p.size() < CUDAAllocatorConfig::max_split_size()) && + ((*it)->size >= CUDAAllocatorConfig::max_split_size())) return false; // Allow oversized block size to be rounded up but within a limit - if ((p.size() >= AcceleratorAllocatorConfig::max_split_size()) && + if ((p.size() >= CUDAAllocatorConfig::max_split_size()) && ((*it)->size >= - p.size() + AcceleratorAllocatorConfig::max_non_split_rounding_size())) + p.size() + CUDAAllocatorConfig::max_non_split_rounding_size())) return false; p.block = *it; pool.blocks.erase(it); @@ -2767,7 +3030,7 @@ class DeviceCachingAllocator { // therefore should be of less overheads. size_t gc_threshold = static_cast( - AcceleratorAllocatorConfig::garbage_collection_threshold() * + CUDAAllocatorConfig::garbage_collection_threshold() * static_cast(allowed_memory_maximum)); // No need to trigger GC yet if (total_allocated_memory <= gc_threshold) { @@ -2915,7 +3178,7 @@ class DeviceCachingAllocator { stats.segment[stat_type].increase(1); stats.reserved_bytes[stat_type].increase(size); }); - if (size >= AcceleratorAllocatorConfig::max_split_size()) + if (size >= CUDAAllocatorConfig::max_split_size()) stats.oversize_segments.increase(1); auto reserved_bytes_gauge = STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes); @@ -2944,7 +3207,7 @@ class DeviceCachingAllocator { bool release_available_cached_blocks( const AllocParams& p, const std::shared_ptr& context) { - if (AcceleratorAllocatorConfig::max_split_size() == + if (CUDAAllocatorConfig::max_split_size() == std::numeric_limits::max()) return false; BlockPool& pool = *p.pool; @@ -2952,8 +3215,8 @@ class DeviceCachingAllocator { // because of std::unique_ptr, block cannot be trivially copied // Use constructor for search key. Block key(p.search_key.device, p.search_key.stream, p.search_key.size); - key.size = (key.size < AcceleratorAllocatorConfig::max_split_size()) - ? AcceleratorAllocatorConfig::max_split_size() + key.size = (key.size < CUDAAllocatorConfig::max_split_size()) + ? CUDAAllocatorConfig::max_split_size() : key.size; auto it = pool.blocks.lower_bound(&key); if (it == pool.blocks.end() || (*it)->stream != p.stream() || @@ -2966,7 +3229,7 @@ class DeviceCachingAllocator { --it; // Back up one item. 
Now on the largest block for the correct // stream while ((totalReleased < key.size) && - ((*it)->size >= AcceleratorAllocatorConfig::max_split_size()) && + ((*it)->size >= CUDAAllocatorConfig::max_split_size()) && ((*it)->stream == p.stream())) { auto cur = it; bool is_first = cur == pool.blocks.begin(); @@ -2974,8 +3237,8 @@ class DeviceCachingAllocator { --it; } if (!(*cur)->expandable_segment_) { - release_block(*cur, context); totalReleased += (*cur)->size; + release_block(*cur, context); } if (is_first) { break; @@ -3091,7 +3354,7 @@ class DeviceCachingAllocator { stats.reserved_bytes[static_cast(StatType::AGGREGATE)] .current); - if (block->size >= AcceleratorAllocatorConfig::max_split_size()) + if (block->size >= CUDAAllocatorConfig::max_split_size()) stats.oversize_segments.decrease(1); pool->blocks.erase(block); delete block; @@ -3284,8 +3547,8 @@ class DeviceCachingAllocator { void insert_events_deferred_until_no_capture( const std::shared_ptr& context) { - if (C10_UNLIKELY(!needs_events_deferred_until_no_capture.empty())) { - for (auto* block : needs_events_deferred_until_no_capture) { + if (C10_UNLIKELY(!deferred_blocks.empty())) { + for (auto& [block, inserted_empty_nodes] : deferred_blocks) { TORCH_INTERNAL_ASSERT(!block->stream_uses.empty()); // only streams recorded before cudagraph will be used to insert events // since we know all streams recorded during cudagraph must have @@ -3297,7 +3560,7 @@ class DeviceCachingAllocator { free_block(block, context); } } - needs_events_deferred_until_no_capture.clear(); + deferred_blocks.clear(); } } @@ -3718,8 +3981,8 @@ class NativeCachingAllocator : public CUDAAllocator { auto& md = result.config_metadata; md.garbage_collection_threshold = - AcceleratorAllocatorConfig::garbage_collection_threshold(); - md.max_split_size = AcceleratorAllocatorConfig::max_split_size(); + CUDAAllocatorConfig::garbage_collection_threshold(); + md.max_split_size = CUDAAllocatorConfig::max_split_size(); md.pinned_num_register_threads = CUDAAllocatorConfig::pinned_num_register_threads(); md.expandable_segments = CUDAAllocatorConfig::expandable_segments(); @@ -3727,10 +3990,11 @@ class NativeCachingAllocator : public CUDAAllocator { CUDAAllocatorConfig::release_lock_on_cudamalloc(); md.pinned_use_host_register = CUDAAllocatorConfig::pinned_use_cuda_host_register(); - md.last_allocator_settings = - AcceleratorAllocatorConfig::last_allocator_settings(); + md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings(); + md.graph_capture_record_stream_reuse = + CUDAAllocatorConfig::graph_capture_record_stream_reuse(); md.roundup_power2_divisions = - AcceleratorAllocatorConfig::roundup_power2_divisions(); + CUDAAllocatorConfig::roundup_power2_divisions(); return result; } @@ -4108,17 +4372,67 @@ CUDAAllocator* allocator(); } // namespace CudaMallocAsync struct BackendStaticInitializer { + // Parses env for backend at load time, duplicating some logic from + // CUDAAllocatorConfig. CUDAAllocatorConfig double-checks it later (at + // runtime). Defers verbose exceptions and error checks, including Cuda + // version checks, to CUDAAllocatorConfig's runtime doublecheck. If this + // works, maybe we should move all of CUDAAllocatorConfig here? CUDAAllocator* parseEnvForBackend() { - // If the environment variable is set, we use the CudaMallocAsync allocator. 
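parseEnvForBackend() above deliberately re-tokenizes the raw config string with std::sregex_token_iterator rather than reusing the allocator-config machinery, since it runs during static initialization. A standalone sketch of that split-by-regex idiom follows; the config string is only an example.

#include <iostream>
#include <regex>
#include <string>
#include <vector>

// Split a string into the substrings *between* regex matches, which is what
// std::sregex_token_iterator yields when given submatch index -1.
static std::vector<std::string> split(const std::string& s,
                                      const std::string& pattern) {
  std::regex re(pattern);
  std::sregex_token_iterator it(s.begin(), s.end(), re, -1), end;
  return {it, end};
}

int main() {
  const std::string conf = "backend:cudaMallocAsync, max_split_size_mb:256";
  for (const auto& option : split(conf, "[\\s,]+")) {  // options on spaces/commas
    auto kv = split(option, "[:]+");                   // key from value
    if (kv.size() >= 2 && kv[0] == "backend") {
      std::cout << "selected backend: " << kv[1] << "\n";  // cudaMallocAsync
    }
  }
}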
- if (CUDAAllocatorConfig::use_async_allocator()) { - return CudaMallocAsync::allocator(); + auto val = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF"); +#ifdef USE_ROCM + // convenience for ROCm users to allow either CUDA or HIP env var + if (!val.has_value()) { + val = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF"); + } +#endif + if (val.has_value()) { + const std::string& config = val.value(); + + std::regex exp("[\\s,]+"); + std::sregex_token_iterator it(config.begin(), config.end(), exp, -1); + std::sregex_token_iterator end; + std::vector options(it, end); + + for (auto option : options) { + std::regex exp2("[:]+"); + std::sregex_token_iterator it2(option.begin(), option.end(), exp2, -1); + std::sregex_token_iterator end2; + std::vector kv(it2, end2); + if (kv.size() >= 2) { + if (kv[0] == "backend") { +#ifdef USE_ROCM + // convenience for ROCm users to allow either CUDA or HIP env var + if (kv[1] == + "cud" + "aMallocAsync" || + kv[1] == "hipMallocAsync") +#else + if (kv[1] == "cudaMallocAsync") +#endif + return CudaMallocAsync::allocator(); + if (kv[1] == "native") + return &Native::allocator; + } + } + } } return &Native::allocator; } BackendStaticInitializer() { auto r = parseEnvForBackend(); +// Register this HIP allocator as the CUDA allocator to allow it to work +// with both c10::GetAllocator(kCUDA) and c10::getDeviceAllocator(kCUDA) +// APIs. We don't perform this masquerading inside +// HIPAllocatorMasqueradingAsCUDA because it needs to happen during static +// initialization, and doing so there may introduce static initialization +// order (SIOF) issues. +#define HIP_MASQUERADING_AS_CUDA \ + "cud" \ + "a" + at::SetAllocator(c10::Device(HIP_MASQUERADING_AS_CUDA).type(), r, 0); allocator.store(r); +#undef HIP_MASQUERADING_AS_CUDA } }; @@ -4145,11 +4459,8 @@ std::atomic MemPool::uuid_{1}; MemPool::MemPool( CUDACachingAllocator::CUDAAllocator* allocator, bool is_user_created, - bool use_on_oom, - bool symmetric) - : allocator_(allocator), - is_user_created_(is_user_created), - symmetric_(symmetric) { + bool use_on_oom) + : allocator_(allocator), is_user_created_(is_user_created) { if (is_user_created_) { id_ = {0, uid_++}; } else { @@ -4172,10 +4483,6 @@ MempoolId_t MemPool::id() { return id_; } -bool MemPool::is_symmetric() { - return symmetric_; -} - CUDACachingAllocator::CUDAAllocator* MemPool::allocator() { return allocator_; } diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h index 956411fe22827..bfc486d69fcff 100644 --- a/c10/cuda/CUDACachingAllocator.h +++ b/c10/cuda/CUDACachingAllocator.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include #include @@ -50,9 +49,10 @@ namespace c10::cuda::CUDACachingAllocator { // Preserved only for BC reasons // NOLINTNEXTLINE(misc-unused-using-decls) -using c10::CachingAllocator::kLargeBuffer; using c10::CachingDeviceAllocator::DeviceStats; +extern const size_t kLargeBuffer; + typedef std::shared_ptr (*CreateContextFn)(); // Struct containing info of an allocation block (i.e. 
a fractional part of a @@ -163,6 +163,7 @@ struct AllocatorConfigInfo { bool expandable_segments; bool release_lock_on_malloc; bool pinned_use_host_register; + bool graph_capture_record_stream_reuse; std::string last_allocator_settings; std::vector roundup_power2_divisions; }; @@ -202,25 +203,24 @@ struct ShareableHandle { std::string handle; }; -class CUDAAllocator : public Allocator { +class CUDAAllocator : public DeviceAllocator { public: virtual void* raw_alloc(size_t nbytes) = 0; virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0; virtual void raw_delete(void* ptr) = 0; virtual void init(int device_count) = 0; - virtual bool initialized() = 0; virtual double getMemoryFraction(c10::DeviceIndex device) = 0; virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0; - virtual void emptyCache(MempoolId_t mempool_id = {0, 0}) = 0; virtual void enable(bool value) = 0; virtual bool isEnabled() const = 0; virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0; virtual void* getBaseAllocation(void* ptr, size_t* size) = 0; - virtual void recordStream(const DataPtr&, CUDAStream stream) = 0; - virtual c10::CachingDeviceAllocator::DeviceStats getDeviceStats( - c10::DeviceIndex device) = 0; - virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0; - virtual void resetPeakStats(c10::DeviceIndex device) = 0; + // Keep for BC only + virtual void recordStream(const DataPtr& ptr, CUDAStream stream) = 0; + void recordStream(const DataPtr& ptr, c10::Stream stream) override { + CUDAStream cuda_stream = CUDAStream(stream); + recordStream(ptr, cuda_stream); + } virtual SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) = 0; virtual void beginAllocateToPool( c10::DeviceIndex device, @@ -525,6 +525,10 @@ inline void enablePeerAccess( namespace c10::cuda { +// Keep BC only +using c10::CaptureId_t; +using c10::MempoolId_t; + // MemPool represents a pool of memory in a caching allocator. Currently, // it's just the ID of the pool object maintained in the CUDACachingAllocator. // @@ -535,8 +539,7 @@ struct C10_CUDA_API MemPool { MemPool( CUDACachingAllocator::CUDAAllocator* allocator = nullptr, bool is_user_created = true, - bool use_on_oom = false, - bool symmetric = false); + bool use_on_oom = false); MemPool(const MemPool&) = delete; MemPool(MemPool&&) = default; MemPool& operator=(const MemPool&) = delete; @@ -544,7 +547,6 @@ struct C10_CUDA_API MemPool { ~MemPool(); MempoolId_t id(); - bool is_symmetric(); CUDACachingAllocator::CUDAAllocator* allocator(); int use_count(); c10::DeviceIndex device(); @@ -556,7 +558,6 @@ struct C10_CUDA_API MemPool { CUDACachingAllocator::CUDAAllocator* allocator_; bool is_user_created_; MempoolId_t id_; - bool symmetric_; c10::DeviceIndex device_; }; diff --git a/c10/cuda/CUDAFunctions.cpp b/c10/cuda/CUDAFunctions.cpp index 0e8cabf618593..9839e4e72049e 100644 --- a/c10/cuda/CUDAFunctions.cpp +++ b/c10/cuda/CUDAFunctions.cpp @@ -53,13 +53,12 @@ int device_count_impl(bool fail_if_no_driver) { "https://pytorch.org to install a PyTorch version that has been " "compiled with your version of the CUDA driver."); } - } break; + } case cudaErrorInitializationError: TORCH_CHECK( false, "CUDA driver initialization failed, you might not " "have a CUDA gpu."); - break; case cudaErrorUnknown: TORCH_CHECK( false, @@ -67,7 +66,6 @@ int device_count_impl(bool fail_if_no_driver) { "incorrectly set up environment, e.g. changing env " "variable CUDA_VISIBLE_DEVICES after program start. 
" "Setting the available devices to be zero."); - break; #if C10_ASAN_ENABLED case cudaErrorMemoryAllocation: // In ASAN mode, we know that a cudaErrorMemoryAllocation error will @@ -80,6 +78,18 @@ int device_count_impl(bool fail_if_no_driver) { "would like to use GPUs, turn off ASAN."); break; #endif // C10_ASAN_ENABLED +#if _WIN32 && CUDA_VERSION >= 13000 + // Workaround for CUDA-13.0 error handling on Windows, see + // https://github.com/pytorch/pytorch/issues/162333#issuecomment-3267929585 + case cudaErrorNotSupported: + if (!fail_if_no_driver) { + TORCH_WARN( + "cudaGetDeviceCount() returned cudaErrorNotSupported, " + "likely using older driver or on CPU machine"); + count = 0; + break; + } +#endif default: TORCH_CHECK( false, diff --git a/c10/cuda/CUDAGraphsC10Utils.h b/c10/cuda/CUDAGraphsC10Utils.h index eb29ca8bc9f02..936875fd71d5c 100644 --- a/c10/cuda/CUDAGraphsC10Utils.h +++ b/c10/cuda/CUDAGraphsC10Utils.h @@ -9,12 +9,6 @@ namespace c10::cuda { -using CaptureId_t = unsigned long long; - -// first is set if the instance is created by CUDAGraph::capture_begin. -// second is set if the instance is created by at::cuda::graph_pool_handle. -using MempoolId_t = std::pair; - // RAII guard for "cudaStreamCaptureMode", a thread-local value // that controls the error-checking strictness of a capture. struct C10_CUDA_API CUDAStreamCaptureModeGuard { diff --git a/c10/cuda/CUDAStream.cpp b/c10/cuda/CUDAStream.cpp index 0cde2d9de01cf..6d2b1e06fda9b 100644 --- a/c10/cuda/CUDAStream.cpp +++ b/c10/cuda/CUDAStream.cpp @@ -147,7 +147,7 @@ static inline StreamIdType streamIdType(StreamId s) { // rightmost bit int mask_for_type = (1 << kStreamTypeBits) - 1; auto val = (s >> 1) & mask_for_type; - TORCH_INTERNAL_ASSERT(val || !(s & 1), "invalid StreamId", s); + TORCH_CHECK(val || !(s & 1), "invalid StreamId", s); return StreamIdType(val); } @@ -216,9 +216,6 @@ static void initSingleStream(int p, DeviceIndex device_index, int i) { // Creates the low and high priority stream pools for the specified device // Warning: only call once per device! static void initDeviceStreamState(DeviceIndex device_index) { - // Switches to the requested device so streams are properly associated - // with it. 
- CUDAGuard device_guard{device_index}; for (const auto i : c10::irange(kStreamsPerPool)) { for (const auto p : c10::irange(max_stream_priorities)) { initSingleStream(p, device_index, i); @@ -279,7 +276,7 @@ cudaStream_t CUDAStream::stream() const { StreamIdType st = streamIdType(stream_id); size_t si = streamIdIndex(stream_id); if (st.isDefault()) { - TORCH_INTERNAL_ASSERT( + TORCH_CHECK( si == 0, "Unrecognized stream ", stream_, @@ -294,7 +291,7 @@ cudaStream_t CUDAStream::stream() const { return reinterpret_cast(stream_id); } else { auto streamType = st.getStreamType(); - TORCH_INTERNAL_ASSERT( + TORCH_CHECK( streamType >= 1 && streamType <= max_stream_priorities, "Unrecognized stream ", stream_, diff --git a/c10/cuda/driver_api.cpp b/c10/cuda/driver_api.cpp index f4b62e53fcc00..d545bf5477b64 100644 --- a/c10/cuda/driver_api.cpp +++ b/c10/cuda/driver_api.cpp @@ -38,6 +38,13 @@ DriverAPI create_driver_api() { C10_NVML_DRIVER_API(LOOKUP_NVML_ENTRY) #undef LOOKUP_NVML_ENTRY } + + if (handle_1) { +#define LOOKUP_NVML_ENTRY_OPTIONAL(name) \ + r.name##_ = ((decltype(&name))dlsym(handle_1, #name)); + C10_NVML_DRIVER_API_OPTIONAL(LOOKUP_NVML_ENTRY_OPTIONAL) +#undef LOOKUP_NVML_ENTRY_OPTIONAL + } return r; } @@ -54,11 +61,14 @@ void* get_symbol(const char* name, int version) { } #endif + // As of CUDA 13, this API is deprecated. +#if defined(CUDA_VERSION) && (CUDA_VERSION < 13000) // This fallback to the old API to try getting the symbol again. if (auto st = cudaGetDriverEntryPoint(name, &out, cudaEnableDefault, &qres); st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) { return out; } +#endif // If the symbol cannot be resolved, report and return nullptr; // the caller is responsible for checking the pointer. diff --git a/c10/cuda/driver_api.h b/c10/cuda/driver_api.h index 9800809d1e535..8910e581a1a4e 100644 --- a/c10/cuda/driver_api.h +++ b/c10/cuda/driver_api.h @@ -53,7 +53,8 @@ #define C10_LIBCUDA_DRIVER_API_OPTIONAL(_) \ _(cuMulticastAddDevice, 12030) \ _(cuMulticastBindMem, 12030) \ - _(cuMulticastCreate, 12030) + _(cuMulticastCreate, 12030) \ + _(cuMulticastUnbind, 12030) #else #define C10_LIBCUDA_DRIVER_API_OPTIONAL(_) #endif @@ -66,6 +67,12 @@ _(nvmlDeviceGetComputeRunningProcesses) \ _(nvmlSystemGetCudaDriverVersion_v2) +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12040) +#define C10_NVML_DRIVER_API_OPTIONAL(_) _(nvmlDeviceGetGpuFabricInfoV) +#else +#define C10_NVML_DRIVER_API_OPTIONAL(_) +#endif + namespace c10::cuda { struct DriverAPI { @@ -74,6 +81,7 @@ struct DriverAPI { C10_LIBCUDA_DRIVER_API_REQUIRED(CREATE_MEMBER_VERSIONED) C10_LIBCUDA_DRIVER_API_OPTIONAL(CREATE_MEMBER_VERSIONED) C10_NVML_DRIVER_API(CREATE_MEMBER) + C10_NVML_DRIVER_API_OPTIONAL(CREATE_MEMBER) #undef CREATE_MEMBER_VERSIONED #undef CREATE_MEMBER diff --git a/c10/metal/atomic.h b/c10/metal/atomic.h index 6dcd9a706ba74..d0cbc03916989 100644 --- a/c10/metal/atomic.h +++ b/c10/metal/atomic.h @@ -124,5 +124,54 @@ struct AtomicType { } }; +// ComplexHalf atomic op +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, half2 value) { + auto ptr = data + offset; + auto old = + ::metal::atomic_load_explicit(ptr, ::metal::memory_order_relaxed); + while (!::metal::atomic_compare_exchange_weak_explicit( + ptr, + &old, + as_type(as_type(old) + value), + ::metal::memory_order_relaxed, + ::metal::memory_order_relaxed)) + ; + } +}; + +// There are no atomic 64-bit add in Metal yet, but templates below implements a +// consistent add I.e. 
if multiple threads are modify the same 64-bit value, +// results stored at the address will eventually be equal to its original value +// plus sum of all operands +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, long value) { + const auto value_bits = as_type(value); + const uint low = static_cast(value_bits); + uint high = static_cast(value_bits >> 32); + auto ptr = data + (offset << 1); + auto old_low = + atomic_fetch_add_explicit(ptr, low, ::metal::memory_order_relaxed); + high += (old_low + low < old_low) ? 1 : 0; + atomic_fetch_add_explicit(ptr + 1, high, ::metal::memory_order_relaxed); + } +}; + +// ComplexFloat atomic op, which again is not really atomic, but eventually +// consistent +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, float2 value) { + auto ptr = data + (offset << 1); + atomic_fetch_add_explicit(ptr + 0, value.x, ::metal::memory_order_relaxed); + atomic_fetch_add_explicit(ptr + 1, value.y, ::metal::memory_order_relaxed); + } +}; + } // namespace metal } // namespace c10 diff --git a/c10/metal/igamma.h b/c10/metal/igamma.h new file mode 100644 index 0000000000000..8dabdbbb621c9 --- /dev/null +++ b/c10/metal/igamma.h @@ -0,0 +1,744 @@ +#pragma once + +#include +#include +#include + +using namespace c10::metal; +using namespace metal; + +namespace c10 { +namespace metal { + +template +inline float log_gamma(const T); + +inline float expm1f(float a); + +template +float erfc(T x); + +} // namespace metal +} // namespace c10 + +namespace { + +template +inline float lgamma(const T a) { + return log_gamma(a); +} + +inline float expm1(float a) { + return expm1f(a); +} + +// NOTE: The following code was ported directly from the CUDA implementation in +// `aten/src/ATen/native/cuda/IGammaKernel.cu` + +/* + * This implementation of the regularized incomplete gamma functions and + * their helper functions are derived from the implementation of SciPy's + * gammainc, Cephes's igam and igamc, and Boost's Lanczos approximations. + * See NOTICE for the licenses. + */ +// regularized lower & upper incomplete gamma +template +scalar_t ratevl( + scalar_t x, + const scalar_t num[], + int64_t M, + const scalar_t denom[], + int64_t N) { + // evaluating rational function, i.e., the ratio of two polynomials + // the coefficients for numerator are given by `num` while coeffs for + // denumerator are given by `denom` + + using accscalar_t = opmath_t; + int64_t i, dir; + accscalar_t y, num_ans, denom_ans; + accscalar_t absx = ::fabs(x); + thread const accscalar_t* p; + + if (absx > 1) { + /* Evaluate as a polynomial in 1/x. 
*/ + dir = -1; + p = num + M; + y = 1 / x; + } else { + dir = 1; + p = num; + y = x; + } + + /* Evaluate the numerator */ + num_ans = *p; + p += dir; + for (i = 1; i <= M; i++) { + num_ans = num_ans * y + *p; + p += dir; + } + /* Evaluate the denominator */ + if (absx > 1) { + p = denom + N; + } else { + p = denom; + } + + denom_ans = *p; + p += dir; + for (i = 1; i <= N; i++) { + denom_ans = denom_ans * y + *p; + p += dir; + } + if (absx > 1) { + i = N - M; + return ::pow(x, static_cast(i)) * num_ans / denom_ans; + } else { + return num_ans / denom_ans; + } +} + +template +scalar_t lanczos_sum_expg_scaled(scalar_t x) { + // lanczos approximation + using accscalar_t = opmath_t; + + const accscalar_t lanczos_sum_expg_scaled_num[13] = { + 0.006061842346248906525783753964555936883222, + 0.5098416655656676188125178644804694509993, + 19.51992788247617482847860966235652136208, + 449.9445569063168119446858607650988409623, + 6955.999602515376140356310115515198987526, + 75999.29304014542649875303443598909137092, + 601859.6171681098786670226533699352302507, + 3481712.15498064590882071018964774556468, + 14605578.08768506808414169982791359218571, + 43338889.32467613834773723740590533316085, + 86363131.28813859145546927288977868422342, + 103794043.1163445451906271053616070238554, + 56906521.91347156388090791033559122686859}; + const accscalar_t lanczos_sum_expg_scaled_denom[13] = { + 1., + 66., + 1925., + 32670., + 357423., + 2637558., + 13339535., + 45995730., + 105258076., + 150917976., + 120543840., + 39916800., + 0}; + return ratevl( + static_cast(x), + lanczos_sum_expg_scaled_num, + sizeof(lanczos_sum_expg_scaled_num) / + sizeof(lanczos_sum_expg_scaled_num[0]) - + 1, + lanczos_sum_expg_scaled_denom, + sizeof(lanczos_sum_expg_scaled_denom) / + sizeof(lanczos_sum_expg_scaled_denom[0]) - + 1); +} + +template +scalar_t _igam_helper_fac(scalar_t a, scalar_t x) { + // compute x^a * exp(-a) / gamma(a) + // corrected from (15) and (16) in [igam2] by replacing exp(x - a) with + // exp(a - x). + + using accscalar_t = opmath_t; + accscalar_t ax, fac, res, num, numfac; + const accscalar_t MAXLOG = 88.72283905206835; + const accscalar_t EXP1 = 2.718281828459045; + const accscalar_t lanczos_g = 6.024680040776729583740234375; + + if (::fabs(a - x) > 0.4 * ::fabs(a)) { + ax = a * ::log(x) - x - ::lgamma(a); + if (ax < -MAXLOG) { + return 0.0; + } + return ::exp(ax); + } + + fac = a + lanczos_g - 0.5; + res = ::sqrt(fac / EXP1) / lanczos_sum_expg_scaled(a); + + if ((a < 200) && (x < 200)) { + res *= ::exp(a - x) * ::pow(x / fac, a); + } else { + num = x - a - lanczos_g + 0.5; + numfac = num / fac; + res *= ::exp(a * (::log1p(numfac) - numfac) + x * (0.5 - lanczos_g) / fac); + } + return res; +} + +template +scalar_t _igam_helper_series(scalar_t a, scalar_t x) { + // Compute igam using DLMF 8.11.4. [igam1] + + using accscalar_t = opmath_t; + const accscalar_t MACHEP = 5.9604644775390625E-8; + const int MAXITER = 2000; + + int i; + accscalar_t ans, ax, c, r; + + ax = _igam_helper_fac(a, x); + if (ax == 0.0) { + return 0.0; + } + + /* power series */ + r = a; + c = 1.0; + ans = 1.0; + + for (i = 0; i < MAXITER; i++) { + r += 1.0; + c *= x / r; + ans += c; + if (c <= MACHEP * ans) { + break; + } + } + return (ans * ax / a); +} + +template +scalar_t _igamc_helper_series(scalar_t a, scalar_t x) { + // Compute igamc using DLMF 8.7.3 [igam1]. This is related to the series in + // _igam_helper_series but extra care is taken to avoid cancellation. 
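
To make the "extra care" above concrete: the loop below accumulates fac = (-x)^n / n! and sum over n >= 1 of (-x)^n / (n! (a+n)), while the n = 0 term of the underlying power series is folded into the subtraction from 1 through expm1, so the leading cancellation is evaluated accurately. In LaTeX, the quantity returned is

\[
Q(a,x) = 1 - \frac{x^{a}}{\Gamma(a)} \sum_{n \ge 0} \frac{(-x)^{n}}{n!\,(a+n)}
       = -\operatorname{expm1}\bigl(a \ln x - \ln\Gamma(a+1)\bigr)
         - e^{\,a \ln x - \ln\Gamma(a)} \sum_{n \ge 1} \frac{(-x)^{n}}{n!\,(a+n)},
\]

which is exactly the final `term - ::exp(a * logx - ::lgamma(a)) * sum` computed at the end of this function.
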
+ + using accscalar_t = opmath_t; + int n; + accscalar_t fac = 1; + accscalar_t sum = 0; + accscalar_t term, logx; + const int MAXITER = 2000; + const accscalar_t MACHEP = 5.9604644775390625E-8; + + for (n = 1; n < MAXITER; n++) { + fac *= -x / n; + term = fac / (a + n); + sum += term; + if (::fabs(term) <= MACHEP * ::fabs(sum)) { + break; + } + } + + logx = ::log(x); + term = -::expm1(a * logx - ::lgamma(1 + a)); + return term - ::exp(a * logx - ::lgamma(a)) * sum; +} + +template +scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) { + // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] + + using accscalar_t = opmath_t; + const accscalar_t d[25][25] = { + {-3.3333333333333333e-1, 8.3333333333333333e-2, + -1.4814814814814815e-2, 1.1574074074074074e-3, + 3.527336860670194e-4, -1.7875514403292181e-4, + 3.9192631785224378e-5, -2.1854485106799922e-6, + -1.85406221071516e-6, 8.296711340953086e-7, + -1.7665952736826079e-7, 6.7078535434014986e-9, + 1.0261809784240308e-8, -4.3820360184533532e-9, + 9.1476995822367902e-10, -2.551419399494625e-11, + -5.8307721325504251e-11, 2.4361948020667416e-11, + -5.0276692801141756e-12, 1.1004392031956135e-13, + 3.3717632624009854e-13, -1.3923887224181621e-13, + 2.8534893807047443e-14, -5.1391118342425726e-16, + -1.9752288294349443e-15}, + {-1.8518518518518519e-3, -3.4722222222222222e-3, 2.6455026455026455e-3, + -9.9022633744855967e-4, 2.0576131687242798e-4, -4.0187757201646091e-7, + -1.8098550334489978e-5, 7.6491609160811101e-6, -1.6120900894563446e-6, + 4.6471278028074343e-9, 1.378633446915721e-7, -5.752545603517705e-8, + 1.1951628599778147e-8, -1.7543241719747648e-11, -1.0091543710600413e-9, + 4.1627929918425826e-10, -8.5639070264929806e-11, 6.0672151016047586e-14, + 7.1624989648114854e-12, -2.9331866437714371e-12, 5.9966963656836887e-13, + -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14, + -4.13125571381061e-15}, + {4.1335978835978836e-3, -2.6813271604938272e-3, 7.7160493827160494e-4, + 2.0093878600823045e-6, -1.0736653226365161e-4, 5.2923448829120125e-5, + -1.2760635188618728e-5, 3.4235787340961381e-8, 1.3721957309062933e-6, + -6.298992138380055e-7, 1.4280614206064242e-7, -2.0477098421990866e-10, + -1.4092529910867521e-8, 6.228974084922022e-9, -1.3670488396617113e-9, + 9.4283561590146782e-13, 1.2872252400089318e-10, -5.5645956134363321e-11, + 1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12, + 4.6622399463901357e-13, -9.905105763906906e-14, 1.8931876768373515e-17, + 8.8592218725911273e-15}, + {6.4943415637860082e-4, 2.2947209362139918e-4, -4.6918949439525571e-4, + 2.6772063206283885e-4, -7.5618016718839764e-5, -2.3965051138672967e-7, + 1.1082654115347302e-5, -5.6749528269915966e-6, 1.4230900732435884e-6, + -2.7861080291528142e-11, -1.6958404091930277e-7, 8.0994649053880824e-8, + -1.9111168485973654e-8, 2.3928620439808118e-12, 2.0620131815488798e-9, + -9.4604966618551322e-10, 2.1541049775774908e-10, -1.388823336813903e-14, + -2.1894761681963939e-11, 9.7909989511716851e-12, -2.1782191880180962e-12, + 6.2088195734079014e-17, 2.126978363279737e-13, -9.3446887915174333e-14, + 2.0453671226782849e-14}, + {-8.618882909167117e-4, 7.8403922172006663e-4, + -2.9907248030319018e-4, -1.4638452578843418e-6, + 6.6414982154651222e-5, -3.9683650471794347e-5, + 1.1375726970678419e-5, 2.5074972262375328e-10, + -1.6954149536558306e-6, 8.9075075322053097e-7, + -2.2929348340008049e-7, 2.956794137544049e-11, + 2.8865829742708784e-8, -1.4189739437803219e-8, + 3.4463580499464897e-9, -2.3024517174528067e-13, + 
-3.9409233028046405e-10, 1.8602338968504502e-10, + -4.356323005056618e-11, 1.2786001016296231e-15, + 4.6792750266579195e-12, -2.1492464706134829e-12, + 4.9088156148096522e-13, -6.3385914848915603e-18, + -5.0453320690800944e-14}, + {-3.3679855336635815e-4, -6.9728137583658578e-5, 2.7727532449593921e-4, + -1.9932570516188848e-4, 6.7977804779372078e-5, 1.419062920643967e-7, + -1.3594048189768693e-5, 8.0184702563342015e-6, -2.2914811765080952e-6, + -3.252473551298454e-10, 3.4652846491085265e-7, -1.8447187191171343e-7, + 4.8240967037894181e-8, -1.7989466721743515e-14, -6.3061945000135234e-9, + 3.1624176287745679e-9, -7.8409242536974293e-10, 5.1926791652540407e-15, + 9.3589442423067836e-11, -4.5134262161632782e-11, 1.0799129993116827e-11, + -3.661886712685252e-17, -1.210902069055155e-12, 5.6807435849905643e-13, + -1.3249659916340829e-13}, + {5.3130793646399222e-4, -5.9216643735369388e-4, 2.7087820967180448e-4, + 7.9023532326603279e-7, -8.1539693675619688e-5, 5.6116827531062497e-5, + -1.8329116582843376e-5, -3.0796134506033048e-9, 3.4651553688036091e-6, + -2.0291327396058604e-6, 5.7887928631490037e-7, 2.338630673826657e-13, + -8.8286007463304835e-8, 4.7435958880408128e-8, -1.2545415020710382e-8, + 8.6496488580102925e-14, 1.6846058979264063e-9, -8.5754928235775947e-10, + 2.1598224929232125e-10, -7.6132305204761539e-16, -2.6639822008536144e-11, + 1.3065700536611057e-11, -3.1799163902367977e-12, 4.7109761213674315e-18, + 3.6902800842763467e-13}, + {3.4436760689237767e-4, 5.1717909082605922e-5, + -3.3493161081142236e-4, 2.812695154763237e-4, + -1.0976582244684731e-4, -1.2741009095484485e-7, + 2.7744451511563644e-5, -1.8263488805711333e-5, + 5.7876949497350524e-6, 4.9387589339362704e-10, + -1.0595367014026043e-6, 6.1667143761104075e-7, + -1.7562973359060462e-7, -1.2974473287015439e-12, + 2.695423606288966e-8, -1.4578352908731271e-8, + 3.887645959386175e-9, -3.8810022510194121e-17, + -5.3279941738772867e-10, 2.7437977643314845e-10, + -6.9957960920705679e-11, 2.5899863874868481e-17, + 8.8566890996696381e-12, -4.403168815871311e-12, + 1.0865561947091654e-12}, + {-6.5262391859530942e-4, 8.3949872067208728e-4, -4.3829709854172101e-4, + -6.969091458420552e-7, 1.6644846642067548e-4, -1.2783517679769219e-4, + 4.6299532636913043e-5, 4.5579098679227077e-9, -1.0595271125805195e-5, + 6.7833429048651666e-6, -2.1075476666258804e-6, -1.7213731432817145e-11, + 3.7735877416110979e-7, -2.1867506700122867e-7, 6.2202288040189269e-8, + 6.5977038267330006e-16, -9.5903864974256858e-9, 5.2132144922808078e-9, + -1.3991589583935709e-9, 5.382058999060575e-16, 1.9484714275467745e-10, + -1.0127287556389682e-10, 2.6077347197254926e-11, -5.0904186999932993e-18, + -3.3721464474854592e-12}, + {-5.9676129019274625e-4, -7.2048954160200106e-5, + 6.7823088376673284e-4, -6.4014752602627585e-4, + 2.7750107634328704e-4, 1.8197008380465151e-7, + -8.4795071170685032e-5, 6.105192082501531e-5, + -2.1073920183404862e-5, -8.8585890141255994e-10, + 4.5284535953805377e-6, -2.8427815022504408e-6, + 8.7082341778646412e-7, 3.6886101871706965e-12, + -1.5344695190702061e-7, 8.862466778790695e-8, + -2.5184812301826817e-8, -1.0225912098215092e-14, + 3.8969470758154777e-9, -2.1267304792235635e-9, + 5.7370135528051385e-10, -1.887749850169741e-19, + -8.0931538694657866e-11, 4.2382723283449199e-11, + -1.1002224534207726e-11}, + {1.3324454494800656e-3, -1.9144384985654775e-3, 1.1089369134596637e-3, + 9.932404122642299e-7, -5.0874501293093199e-4, 4.2735056665392884e-4, + -1.6858853767910799e-4, -8.1301893922784998e-9, 4.5284402370562147e-5, + 
-3.127053674781734e-5, 1.044986828530338e-5, 4.8435226265680926e-11, + -2.1482565873456258e-6, 1.329369701097492e-6, -4.0295693092101029e-7, + -1.7567877666323291e-13, 7.0145043163668257e-8, -4.040787734999483e-8, + 1.1474026743371963e-8, 3.9642746853563325e-18, -1.7804938269892714e-9, + 9.7480262548731646e-10, -2.6405338676507616e-10, 5.794875163403742e-18, + 3.7647749553543836e-11}, + {1.579727660730835e-3, 1.6251626278391582e-4, -2.0633421035543276e-3, + 2.1389686185689098e-3, -1.0108559391263003e-3, -3.9912705529919201e-7, + 3.6235025084764691e-4, -2.8143901463712154e-4, 1.0449513336495887e-4, + 2.1211418491830297e-9, -2.5779417251947842e-5, 1.7281818956040463e-5, + -5.6413773872904282e-6, -1.1024320105776174e-11, 1.1223224418895175e-6, + -6.8693396379526735e-7, 2.0653236975414887e-7, 4.6714772409838506e-14, + -3.5609886164949055e-8, 2.0470855345905963e-8, -5.8091738633283358e-9, + -1.332821287582869e-16, 9.0354604391335133e-10, -4.9598782517330834e-10, + 1.3481607129399749e-10}, + {-4.0725121195140166e-3, 6.4033628338080698e-3, -4.0410161081676618e-3, + -2.183732802866233e-6, 2.1740441801254639e-3, -1.9700440518418892e-3, + 8.3595469747962458e-4, 1.9445447567109655e-8, -2.5779387120421696e-4, + 1.9009987368139304e-4, -6.7696499937438965e-5, -1.4440629666426572e-10, + 1.5712512518742269e-5, -1.0304008744776893e-5, 3.304517767401387e-6, + 7.9829760242325709e-13, -6.4097794149313004e-7, 3.8894624761300056e-7, + -1.1618347644948869e-7, -2.816808630596451e-15, 1.9878012911297093e-8, + -1.1407719956357511e-8, 3.2355857064185555e-9, 4.1759468293455945e-20, + -5.0423112718105824e-10}, + {-5.9475779383993003e-3, -5.4016476789260452e-4, 8.7910413550767898e-3, + -9.8576315587856125e-3, 5.0134695031021538e-3, 1.2807521786221875e-6, + -2.0626019342754683e-3, 1.7109128573523058e-3, -6.7695312714133799e-4, + -6.9011545676562133e-9, 1.8855128143995902e-4, -1.3395215663491969e-4, + 4.6263183033528039e-5, 4.0034230613321351e-11, -1.0255652921494033e-5, + 6.612086372797651e-6, -2.0913022027253008e-6, -2.0951775649603837e-13, + 3.9756029041993247e-7, -2.3956211978815887e-7, 7.1182883382145864e-8, + 8.925574873053455e-16, -1.2101547235064676e-8, 6.9350618248334386e-9, + -1.9661464453856102e-9}, + {1.7402027787522711e-2, -2.9527880945699121e-2, 2.0045875571402799e-2, + 7.0289515966903407e-6, -1.2375421071343148e-2, 1.1976293444235254e-2, + -5.4156038466518525e-3, -6.3290893396418616e-8, 1.8855118129005065e-3, + -1.473473274825001e-3, 5.5515810097708387e-4, 5.2406834412550662e-10, + -1.4357913535784836e-4, 9.9181293224943297e-5, -3.3460834749478311e-5, + -3.5755837291098993e-12, 7.1560851960630076e-6, -4.5516802628155526e-6, + 1.4236576649271475e-6, 1.8803149082089664e-14, -2.6623403898929211e-7, + 1.5950642189595716e-7, -4.7187514673841102e-8, -6.5107872958755177e-17, + 7.9795091026746235e-9}, + {3.0249124160905891e-2, 2.4817436002649977e-3, -4.9939134373457022e-2, + 5.9915643009307869e-2, -3.2483207601623391e-2, -5.7212968652103441e-6, + 1.5085251778569354e-2, -1.3261324005088445e-2, 5.5515262632426148e-3, + 3.0263182257030016e-8, -1.7229548406756723e-3, 1.2893570099929637e-3, + -4.6845138348319876e-4, -1.830259937893045e-10, 1.1449739014822654e-4, + -7.7378565221244477e-5, 2.5625836246985201e-5, 1.0766165333192814e-12, + -5.3246809282422621e-6, 3.349634863064464e-6, -1.0381253128684018e-6, + -5.608909920621128e-15, 1.9150821930676591e-7, -1.1418365800203486e-7, + 3.3654425209171788e-8}, + {-9.9051020880159045e-2, 1.7954011706123486e-1, -1.2989606383463778e-1, + -3.1478872752284357e-5, 
9.0510635276848131e-2, -9.2828824411184397e-2, + 4.4412112839877808e-2, 2.7779236316835888e-7, -1.7229543805449697e-2, + 1.4182925050891573e-2, -5.6214161633747336e-3, -2.39598509186381e-9, + 1.6029634366079908e-3, -1.1606784674435773e-3, 4.1001337768153873e-4, + 1.8365800754090661e-11, -9.5844256563655903e-5, 6.3643062337764708e-5, + -2.076250624489065e-5, -1.1806020912804483e-13, 4.2131808239120649e-6, + -2.6262241337012467e-6, 8.0770620494930662e-7, 6.0125912123632725e-16, + -1.4729737374018841e-7}, + {-1.9994542198219728e-1, -1.5056113040026424e-2, 3.6470239469348489e-1, + -4.6435192311733545e-1, 2.6640934719197893e-1, 3.4038266027147191e-5, + -1.3784338709329624e-1, 1.276467178337056e-1, -5.6213828755200985e-2, + -1.753150885483011e-7, 1.9235592956768113e-2, -1.5088821281095315e-2, + 5.7401854451350123e-3, 1.0622382710310225e-9, -1.5335082692563998e-3, + 1.0819320643228214e-3, -3.7372510193945659e-4, -6.6170909729031985e-12, + 8.4263617380909628e-5, -5.5150706827483479e-5, 1.7769536448348069e-5, + 3.8827923210205533e-14, -3.53513697488768e-6, 2.1865832130045269e-6, + -6.6812849447625594e-7}, + {7.2438608504029431e-1, -1.3918010932653375, 1.0654143352413968, + 1.876173868950258e-4, -8.2705501176152696e-1, 8.9352433347828414e-1, + -4.4971003995291339e-1, -1.6107401567546652e-6, 1.9235590165271091e-1, + -1.6597702160042609e-1, 6.8882222681814333e-2, 1.3910091724608687e-8, + -2.146911561508663e-2, 1.6228980898865892e-2, -5.9796016172584256e-3, + -1.1287469112826745e-10, 1.5167451119784857e-3, -1.0478634293553899e-3, + 3.5539072889126421e-4, 8.1704322111801517e-13, -7.7773013442452395e-5, + 5.0291413897007722e-5, -1.6035083867000518e-5, 1.2469354315487605e-14, + 3.1369106244517615e-6}, + {1.6668949727276811, 1.165462765994632e-1, -3.3288393225018906, + 4.4692325482864037, -2.6977693045875807, -2.600667859891061e-4, + 1.5389017615694539, -1.4937962361134612, 6.8881964633233148e-1, + 1.3077482004552385e-6, -2.5762963325596288e-1, 2.1097676102125449e-1, + -8.3714408359219882e-2, -7.7920428881354753e-9, 2.4267923064833599e-2, + -1.7813678334552311e-2, 6.3970330388900056e-3, 4.9430807090480523e-11, + -1.5554602758465635e-3, 1.0561196919903214e-3, -3.5277184460472902e-4, + 9.3002334645022459e-14, 7.5285855026557172e-5, -4.8186515569156351e-5, + 1.5227271505597605e-5}, + {-6.6188298861372935, 1.3397985455142589e+1, -1.0789350606845146e+1, + -1.4352254537875018e-3, 9.2333694596189809, -1.0456552819547769e+1, + 5.5105526029033471, 1.2024439690716742e-5, -2.5762961164755816, + 2.3207442745387179, -1.0045728797216284, -1.0207833290021914e-7, + 3.3975092171169466e-1, -2.6720517450757468e-1, 1.0235252851562706e-1, + 8.4329730484871625e-10, -2.7998284958442595e-2, 2.0066274144976813e-2, + -7.0554368915086242e-3, 1.9402238183698188e-12, 1.6562888105449611e-3, + -1.1082898580743683e-3, 3.654545161310169e-4, -5.1290032026971794e-11, + -7.6340103696869031e-5}, + {-1.7112706061976095e+1, -1.1208044642899116, 3.7131966511885444e+1, + -5.2298271025348962e+1, 3.3058589696624618e+1, 2.4791298976200222e-3, + -2.061089403411526e+1, 2.088672775145582e+1, -1.0045703956517752e+1, + -1.2238783449063012e-5, 4.0770134274221141, -3.473667358470195, + 1.4329352617312006, 7.1359914411879712e-8, -4.4797257159115612e-1, + 3.4112666080644461e-1, -1.2699786326594923e-1, -2.8953677269081528e-10, + 3.3125776278259863e-2, -2.3274087021036101e-2, 8.0399993503648882e-3, + -1.177805216235265e-9, -1.8321624891071668e-3, 1.2108282933588665e-3, + -3.9479941246822517e-4}, + {7.389033153567425e+1, -1.5680141270402273e+2, 
1.322177542759164e+2, + 1.3692876877324546e-2, -1.2366496885920151e+2, 1.4620689391062729e+2, + -8.0365587724865346e+1, -1.1259851148881298e-4, 4.0770132196179938e+1, + -3.8210340013273034e+1, 1.719522294277362e+1, 9.3519707955168356e-7, + -6.2716159907747034, 5.1168999071852637, -2.0319658112299095, + -4.9507215582761543e-9, 5.9626397294332597e-1, -4.4220765337238094e-1, + 1.6079998700166273e-1, -2.4733786203223402e-8, -4.0307574759979762e-2, + 2.7849050747097869e-2, -9.4751858992054221e-3, 6.419922235909132e-6, + 2.1250180774699461e-3}, + {2.1216837098382522e+2, 1.3107863022633868e+1, -4.9698285932871748e+2, + 7.3121595266969204e+2, -4.8213821720890847e+2, -2.8817248692894889e-2, + 3.2616720302947102e+2, -3.4389340280087117e+2, 1.7195193870816232e+2, + 1.4038077378096158e-4, -7.52594195897599e+1, 6.651969984520934e+1, + -2.8447519748152462e+1, -7.613702615875391e-7, 9.5402237105304373, + -7.5175301113311376, 2.8943997568871961, -4.6612194999538201e-7, + -8.0615149598794088e-1, 5.8483006570631029e-1, -2.0845408972964956e-1, + 1.4765818959305817e-4, 5.1000433863753019e-2, -3.3066252141883665e-2, + 1.5109265210467774e-2}, + {-9.8959643098322368e+2, 2.1925555360905233e+3, -1.9283586782723356e+3, + -1.5925738122215253e-1, 1.9569985945919857e+3, -2.4072514765081556e+3, + 1.3756149959336496e+3, 1.2920735237496668e-3, -7.525941715948055e+2, + 7.3171668742208716e+2, -3.4137023466220065e+2, -9.9857390260608043e-6, + 1.3356313181291573e+2, -1.1276295161252794e+2, 4.6310396098204458e+1, + -7.9237387133614756e-6, -1.4510726927018646e+1, 1.1111771248100563e+1, + -4.1690817945270892, 3.1008219800117808e-3, 1.1220095449981468, + -7.6052379926149916e-1, 3.6262236505085254e-1, 2.216867741940747e-1, + 4.8683443692930507e-1}}; + + int k, n, sgn; + int maxpow = 0; + const accscalar_t MACHEP = 5.9604644775390625E-8; + accscalar_t lambda = x / a; + accscalar_t sigma = (x - a) / a; + accscalar_t eta, res, ck, ckterm, term, absterm; + accscalar_t absoldterm = INFINITY; + accscalar_t etapow[25] = {1}; + accscalar_t sum = 0; + accscalar_t afac = 1; + + if (igam) { + sgn = -1; + } else { + sgn = 1; + } + + if (lambda > 1) { + eta = ::sqrt(-2 * (::log1p(sigma) - sigma)); + } else if (lambda < 1) { + eta = -::sqrt(-2 * (::log1p(sigma) - sigma)); + } else { + eta = 0; + } + res = 0.5 * ::erfc(sgn * eta * ::sqrt(a / 2)); + + for (k = 0; k < 25; k++) { + ck = d[k][0]; + for (n = 1; n < 25; n++) { + if (n > maxpow) { + etapow[n] = eta * etapow[n - 1]; + maxpow += 1; + } + ckterm = d[k][n] * etapow[n]; + ck += ckterm; + if (::fabs(ckterm) < MACHEP * ::fabs(ck)) { + break; + } + } + term = ck * afac; + absterm = ::fabs(term); + if (absterm > absoldterm) { + break; + } + sum += term; + if (absterm < MACHEP * ::fabs(sum)) { + break; + } + absoldterm = absterm; + afac /= a; + } + res += sgn * ::exp(-0.5 * a * eta * eta) * sum / ::sqrt(2 * 3.1415926535 * a); + + return res; +} + +template +scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar_t x) { + // Compute igamc using DLMF 8.9.2. 
[igam1] + + using accscalar_t = opmath_t; + int i; + accscalar_t ans, ax, c, yc, r, t, y, z; + accscalar_t pk, pkm1, pkm2, qk, qkm1, qkm2; + const int MAXITER = 2000; + const accscalar_t MACHEP = 5.9604644775390625E-8; + const accscalar_t BIG = 16777216.; + const accscalar_t BIGINV = 5.9604644775390625E-8; + + ax = _igam_helper_fac(a, x); + if (ax == 0.0) { + return 0.0; + } + + /* continued fraction */ + y = 1.0 - a; + z = x + y + 1.0; + c = 0.0; + pkm2 = 1.0; + qkm2 = x; + pkm1 = x + 1.0; + qkm1 = z * x; + ans = pkm1 / qkm1; + + for (i = 0; i < MAXITER; i++) { + c += 1.0; + y += 1.0; + z += 2.0; + yc = y * c; + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if (qk != 0) { + r = pk / qk; + t = ::fabs((ans - r) / r); + ans = r; + } else { + t = 1.0; + } + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + if (::fabs(pk) > BIG) { + pkm2 *= BIGINV; + pkm1 *= BIGINV; + qkm2 *= BIGINV; + qkm1 *= BIGINV; + } + if (t <= MACHEP) { + break; + } + } + return ans * ax; +} + +template +scalar_t calc_igammac(scalar_t a, scalar_t x) { + /* the calculation of the regularized upper incomplete gamma function + * is done differently based on the values of a and x: + * - if x and/or a is at the boundary of defined region, then assign the + * result at the boundary + * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for + * Large Parameter (see DLMF 8.12.4 [igam1]) + * - if x > 1.1 and x < a, using the subtraction from the regularized lower + * incomplete gamma + * - otherwise, calculate the series from [igam2] eq (5) + */ + + using accscalar_t = opmath_t; + accscalar_t absxma_a; + + const accscalar_t SMALL = 20.0; + const accscalar_t LARGE = 200.0; + const accscalar_t SMALLRATIO = 0.3; + const accscalar_t LARGERATIO = 4.5; + + if ((x < 0) || (a < 0)) { + // out of defined-region of the function + return NAN; + } else if (a == 0) { + if (x > 0) { + return 0.0; + } else { + return NAN; + } + } else if (x == 0) { + return 1.0; + } else if (isinf(a)) { + if (isinf(x)) { + return NAN; + } + return 1.0; + } else if (isinf(x)) { + return 0.0; + } + + absxma_a = ::fabs(x - a) / a; + if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) { + return _igam_helper_asymptotic_series(a, x, 0); + } else if ((a > LARGE) && (absxma_a < LARGERATIO / ::sqrt(a))) { + return _igam_helper_asymptotic_series(a, x, 0); + } + + if (x > 1.1) { + if (x < a) { + return 1.0 - _igam_helper_series(a, x); + } else { + return _igamc_helper_continued_fraction(a, x); + } + } else if (x <= 0.5) { + if (-0.4 / ::log(x) < a) { + return 1.0 - _igam_helper_series(a, x); + } else { + return _igamc_helper_series(a, x); + } + } else { + if (x * 1.1 < a) { + return 1.0 - _igam_helper_series(a, x); + } else { + return _igamc_helper_series(a, x); + } + } +} + +template +scalar_t calc_igamma(scalar_t a, scalar_t x) { + /* the calculation of the regularized lower incomplete gamma function + * is done differently based on the values of a and x: + * - if x and/or a is at the boundary of defined region, then assign the + * result at the boundary + * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for + * Large Parameter (see DLMF 8.12.3 [igam1]) + * - if x > 1 and x > a, using the subtraction from the regularized upper + * incomplete gamma + * - otherwise, calculate the series from [igam2] eq (4) + */ + + using accscalar_t = opmath_t; + accscalar_t absxma_a; + const accscalar_t SMALL = 20.0; + const accscalar_t LARGE = 200.0; + const accscalar_t SMALLRATIO = 0.3; + const accscalar_t LARGERATIO = 
4.5; + + // boundary values following SciPy + if ((x < 0) || (a < 0)) { + // out of defined-region of the function + return NAN; + } else if (a == 0) { + if (x > 0) { + return 1.0; + } else { + return NAN; + } + } else if (x == 0) { + return 0.0; // zero integration limit + } else if (isinf(a)) { + if (isinf(x)) { + return NAN; + } + return 0.0; + } else if (isinf(x)) { + return 1.0; + } + + /* Asymptotic regime where a ~ x. */ + absxma_a = ::fabs(x - a) / a; + if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) { + return _igam_helper_asymptotic_series(a, x, 1); + } else if ((a > LARGE) && (absxma_a < LARGERATIO / ::sqrt(a))) { + return _igam_helper_asymptotic_series(a, x, 1); + } + + if ((x > 1.0) && (x > a)) { + return 1.0 - calc_igammac(a, x); + } + + return _igam_helper_series(a, x); +} + +} // namespace + +// end of regularized lower & upper incomplete gamma + +namespace c10 { +namespace metal { + +template +inline T igamma(T a, T b) { + return calc_igamma(a, b); +} + +template +inline T igammac(T a, T b) { + return calc_igammac(a, b); +} + +} // namespace metal +} // namespace c10 diff --git a/c10/metal/special_math.h b/c10/metal/special_math.h index 34f6ab6d1d09e..29a45ff4c30b6 100644 --- a/c10/metal/special_math.h +++ b/c10/metal/special_math.h @@ -1,6 +1,7 @@ // Implementation of specal math functions for Metal #pragma once #include +#include #include #include @@ -47,6 +48,11 @@ inline float erf(T x) { return r; } +template +float erfc(T x) { + return 1.0 - erf(x); +} + template inline float erfinv(T y) { /* coefficients in rational expansion */ diff --git a/c10/test/build.bzl b/c10/test/build.bzl index 2f54c8a2faa5b..deb917dd8fcf3 100644 --- a/c10/test/build.bzl +++ b/c10/test/build.bzl @@ -46,7 +46,7 @@ def define_targets(rules): "util/typeid_test.cpp", ], ), - copts = ["-Wno-deprecated-declarations"], + copts = ["-Wno-deprecated-declarations", "-Wno-ctad-maybe-unsupported"], deps = [ ":Macros", ":complex_math_test_common", diff --git a/c10/test/core/SymInt_test.cpp b/c10/test/core/SymInt_test.cpp index 7cefa1e4a771b..e408543f5362c 100644 --- a/c10/test/core/SymInt_test.cpp +++ b/c10/test/core/SymInt_test.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -35,4 +36,169 @@ TEST(SymIntTest, Overflows) { } #endif +namespace { + +// We need a SymNodeImpl that 1) has working arithmetic with +// predictable results and 2) causes SymInt::maybe_as_int to return +// nullopt so that we can hit all 4 cases (zero/one/both arguments +// have null maybe_as_int) in the operator implementations. +class ConstantIntPretendingToBeSymbolicSymNodeImpl + : public ConstantSymNodeImpl { + public: + using ConstantSymNodeImpl::ConstantSymNodeImpl; + std::optional constant_int() override { + return std::nullopt; + } + std::optional maybe_as_int() override { + return std::nullopt; + } + // Needs to be implemented for arithmetic to actually + // work. NestedIntSymNodeImpl does this, for example. 
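
The wrap_int/wrap_bool overrides that follow are what this comment refers to: when a SymInt binary operator sees one plain integer operand and one symbolic operand, the plain value is promoted by calling wrap_int on the symbolic node and the arithmetic is then dispatched on the wrapped node, so the wrapper has to supply working add/sub/etc. A sketch of the kind of test this enables; the test name and the exact promotion path are assumptions, and it relies only on the create_symbolic_symint/unwrap helpers defined later in this file plus the gtest macros the file already uses:

TEST(SymIntTest, ArithmeticCoversAllMaybeAsIntCases) {
  SymInt sym_a = create_symbolic_symint(3); // maybe_as_int() == nullopt
  SymInt sym_b = create_symbolic_symint(4);
  SymInt plain_a(3); // maybe_as_int() has a value
  SymInt plain_b(4);

  EXPECT_EQ(unwrap(sym_a + sym_b), 7); // both operands symbolic
  EXPECT_EQ(unwrap(sym_a + plain_b), 7); // rhs promoted via wrap_int
  EXPECT_EQ(unwrap(plain_a + sym_b), 7); // lhs promoted via wrap_int
  EXPECT_EQ(unwrap(plain_a + plain_b), 7); // both plain, constant fast path
}
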
+  c10::SymNode wrap_int(int64_t num) override {
+    return SymNode(
+        c10::make_intrusive<ConstantIntPretendingToBeSymbolicSymNodeImpl>(num));
+  }
+
+  c10::SymNode wrap_bool(bool b) override {
+    return SymNode(c10::make_intrusive<ConstantSymNodeImpl<bool>>(b));
+  }
+
+  SymNode add(const SymNode& other) override {
+    return wrap_int(int_() + other->int_());
+  }
+
+  SymNode sub(const SymNode& other) override {
+    return wrap_int(int_() - other->int_());
+  }
+
+  SymNode mul(const SymNode& other) override {
+    return wrap_int(int_() * other->int_());
+  }
+
+  SymNode floordiv(const SymNode& other) override {
+    return wrap_int(int_() / other->int_());
+  }
+
+  SymNode sym_min(const SymNode& other) override {
+    return wrap_int(std::min(int_(), other->int_()));
+  }
+
+  SymNode sym_max(const SymNode& other) override {
+    return wrap_int(std::max(int_(), other->int_()));
+  }
+
+  SymNode mod(const SymNode& other) override {
+    return wrap_int(int_() % other->int_());
+  }
+
+  SymNode eq(const SymNode& other) override {
+    return wrap_bool(int_() == other->int_());
+  }
+
+  SymNode ne(const SymNode& other) override {
+    return wrap_bool(int_() != other->int_());
+  }
+
+  SymNode lt(const SymNode& other) override {
+    return wrap_bool(int_() < other->int_());
+  }
+
+  SymNode le(const SymNode& other) override {
+    return wrap_bool(int_() <= other->int_());
+  }
+
+  SymNode gt(const SymNode& other) override {
+    return wrap_bool(int_() > other->int_());
+  }
+
+  SymNode ge(const SymNode& other) override {
+    return wrap_bool(int_() >= other->int_());
+  }
+};
+
+SymInt create_symbolic_symint(int64_t value) {
+  return SymInt(
+      SymNode(c10::make_intrusive<ConstantIntPretendingToBeSymbolicSymNodeImpl>(
+          value)));
+}
+
+auto unwrap(const SymInt& x) {
+  return x.guard_int(__FILE__, __LINE__);
+}
+
+auto unwrap(bool b) {
+  return b;
+}
+
+template